IMPORTAMOS LAS LIBRERÍAS
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.impute import KNNImputer
import scipy.stats as ss
import warnings
pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_rows', 10000)
FUNCIONES
def plot_feature(df, col_name, isContinuous, target):
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12,3), dpi=90)
count_null = df[col_name].isnull().sum()
if isContinuous:
sns.histplot(df.loc[df[col_name].notnull(), col_name], kde=False, ax=ax1)
else:
sns.countplot(df[col_name], order=sorted(df[col_name].unique()), color='#5975A4', saturation=1, ax=ax1)
ax1.set_xlabel(col_name)
ax1.set_ylabel('Count')
ax1.set_title(col_name+ ' Numero de nulos: '+str(count_null))
plt.xticks(rotation = 90)
if isContinuous:
sns.boxplot(x=col_name, y=target, data=df, ax=ax2)
ax2.set_ylabel('')
ax2.set_title(col_name + ' by '+target)
else:
data = df.groupby(col_name)[target].value_counts(normalize=True).to_frame('proportion').reset_index()
data.columns = [i, target, 'proportion']
#sns.barplot(x = col_name, y = 'proportion', hue= target, data = data, saturation=1, ax=ax2)
sns.barplot(x = col_name, y = 'proportion', hue= target, data = data, saturation=1, ax=ax2)
ax2.set_ylabel(target+' fraction')
ax2.set_title(target)
plt.xticks(rotation = 90)
ax2.set_xlabel(col_name)
plt.tight_layout()
def dame_variables_categoricas(dataset=None):
if dataset is None:
print(u'\nFaltan argumentos por pasar a la función')
return 1
lista_variables_categoricas = []
other = []
for i in dataset.columns:
if dataset[i].dtype == object:
unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
if unicos < 100:
lista_variables_categoricas.append(i)
else:
other.append(i)
if dataset[i].dtype == int:
unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
if unicos < 10:
lista_variables_categoricas.append(i)
else:
other.append(i)
return lista_variables_categoricas, other
def get_corr_matrix(dataset = None, metodo='pearson', size_figure=[10,8]):
# Para obtener la correlación de Spearman, sólo cambiar el metodo por 'spearman'
if dataset is None:
print(u'\nHace falta pasar argumentos a la función')
return 1
sns.set(style="white")
# Compute the correlation matrix
corr = dataset.corr(method=metodo)
# Set self-correlation to zero to avoid distraction
for i in range(corr.shape[0]):
corr.iloc[i, i] = 0
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=size_figure)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, center=0,
square=True, linewidths=.5, cmap ='viridis' ) #cbar_kws={"shrink": .5}
plt.show()
return 0
def get_deviation_of_mean_perc(pd_loan, list_var_continuous, target, multiplier):
pd_final = pd.DataFrame()
for i in list_var_continuous:
series_mean = pd_loan[i].mean()
series_std = pd_loan[i].std()
std_amp = multiplier * series_std
left = series_mean - std_amp
right = series_mean + std_amp
size_s = pd_loan[i].size
perc_goods = pd_loan[i][(pd_loan[i] >= left) & (pd_loan[i] <= right)].size/size_s
perc_excess = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size/size_s
if perc_excess>0:
pd_concat_percent = pd.DataFrame(pd_loan[target][(pd_loan[i] < left) | (pd_loan[i] > right)]\
.value_counts(normalize=True).reset_index()).T
pd_concat_percent.columns = [pd_concat_percent.iloc[0,0],
pd_concat_percent.iloc[0,1]]
pd_concat_percent = pd_concat_percent.drop('TARGET',axis=0)
pd_concat_percent['variable'] = i
pd_concat_percent['sum_outlier_values'] = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size
pd_concat_percent['porcentaje_sum_null_values'] = perc_excess
pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
if pd_final.empty:
print('No existen variables con valores nulos')
return pd_final
def get_percent_null_values_target(pd_loan, list_var_continuous, target):
pd_final = pd.DataFrame()
for i in list_var_continuous:
if pd_loan[i].isnull().sum()>0:
pd_concat_percent = pd.DataFrame(pd_loan[target][pd_loan[i].isnull()]\
.value_counts(normalize=True).reset_index()).T
pd_concat_percent.columns = [pd_concat_percent.iloc[0,0],
pd_concat_percent.iloc[0,1]]
pd_concat_percent = pd_concat_percent.drop('TARGET',axis=0)
pd_concat_percent['variable'] = i
pd_concat_percent['sum_null_values'] = pd_loan[i].isnull().sum()
pd_concat_percent['porcentaje_sum_null_values'] = pd_loan[i].isnull().sum()/pd_loan.shape[0]
pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
if pd_final.empty:
print('No existen variables con valores nulos')
return pd_final
def cramers_v(confusion_matrix):
chi2 = ss.chi2_contingency(confusion_matrix)[0]
n = confusion_matrix.sum()
phi2 = chi2 / n
r, k = confusion_matrix.shape
phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
rcorr = r - ((r-1)**2)/(n-1)
kcorr = k - ((k-1)**2)/(n-1)
return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
Con estas funciones estamos tratando de hacer varias cosas:
plot_feature: va a generar gráficos específicos para cada variable donde hará un histograma para analizar la distribución y boxplot para ver cómo se distribuyen los valores en base a la variable TARGET para las variables contínuas. Para las variables categóricas va a crear dos gráficos de barras, para la frecuencia de la variable y otro con las proporciones por categoría de TARGET
dame_variables_categoricas: trata de identificar variables categóricas y separarlas en dos: aquellas variables que tienen menos de 100 valores únicos (estas pueden ser muy útiles en nuestro análisis), y aquellas que tienen más de 100 registros únicos
get_corr_matrix: para visualizar la matriz de correlación para variables numéricas
get_deviation_of_mean_perc: crea un rango de valores alrededor de la media con la desviación típica y encuentra la proporción de valores dentro y fuera de este intervalo, para así poder ver los valores atípicos y la relación que tienen con la variable TARGET.
get_percent_values_target: es para analizar la relación que hay entre valores nulos y la variable TARGET.
cramers_v: para ver la relación entre variables categóricas usando la estadística de Cramer
LECTURA DE DATOS DEL PREPROCESADO INICIAL
path_folder = "../data/"
pd_loan = pd.read_csv(path_folder + "application_data.csv", low_memory = False).set_index('SK_ID_CURR')
pd_loan.head()
| TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | NAME_TYPE_SUITE | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | ORGANIZATION_TYPE | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_MODE | COMMONAREA_MODE | ELEVATORS_MODE | ENTRANCES_MODE | FLOORSMAX_MODE | FLOORSMIN_MODE | LANDAREA_MODE | LIVINGAPARTMENTS_MODE | LIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_MODE | APARTMENTS_MEDI | BASEMENTAREA_MEDI | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MEDI | COMMONAREA_MEDI | ELEVATORS_MEDI | ENTRANCES_MEDI | FLOORSMAX_MEDI | FLOORSMIN_MEDI | LANDAREA_MEDI | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MEDI | FONDKAPREMONT_MODE | HOUSETYPE_MODE | TOTALAREA_MODE | WALLSMATERIAL_MODE | EMERGENCYSTATE_MODE | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_2 | FLAG_DOCUMENT_3 | FLAG_DOCUMENT_4 | FLAG_DOCUMENT_5 | FLAG_DOCUMENT_6 | FLAG_DOCUMENT_7 | FLAG_DOCUMENT_8 | FLAG_DOCUMENT_9 | FLAG_DOCUMENT_10 | FLAG_DOCUMENT_11 | FLAG_DOCUMENT_12 | FLAG_DOCUMENT_13 | FLAG_DOCUMENT_14 | FLAG_DOCUMENT_15 | FLAG_DOCUMENT_16 | FLAG_DOCUMENT_17 | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.018801 | -9461 | -637 | -3648.0 | -2120 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | Laborers | 1.0 | 2 | 2 | WEDNESDAY | 10 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | 0.083037 | 0.262949 | 0.139376 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | 0.0000 | 0.0252 | 0.0383 | 0.9722 | 0.6341 | 0.0144 | 0.0000 | 0.0690 | 0.0833 | 0.1250 | 0.0377 | 0.022 | 0.0198 | 0.0 | 0.0 | 0.0250 | 0.0369 | 0.9722 | 0.6243 | 0.0144 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0375 | 0.0205 | 0.0193 | 0.0000 | 0.00 | reg oper account | block of flats | 0.0149 | Stone, brick | No | 2.0 | 2.0 | 2.0 | 2.0 | -1134.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | Family | State servant | Higher education | Married | House / apartment | 0.003541 | -16765 | -1188 | -1186.0 | -291 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | Core staff | 2.0 | 1 | 1 | MONDAY | 11 | 0 | 0 | 0 | 0 | 0 | 0 | School | 0.311267 | 0.622246 | NaN | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | 0.0098 | 0.0924 | 0.0538 | 0.9851 | 0.8040 | 0.0497 | 0.0806 | 0.0345 | 0.2917 | 0.3333 | 0.0128 | 0.079 | 0.0554 | 0.0 | 0.0 | 0.0968 | 0.0529 | 0.9851 | 0.7987 | 0.0608 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0132 | 0.0787 | 0.0558 | 0.0039 | 0.01 | reg oper account | block of flats | 0.0714 | Block | No | 1.0 | 0.0 | 1.0 | 0.0 | -828.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | 135000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.010032 | -19046 | -225 | -4260.0 | -2531 | 26.0 | 1 | 1 | 1 | 1 | 1 | 0 | Laborers | 1.0 | 2 | 2 | MONDAY | 9 | 0 | 0 | 0 | 0 | 0 | 0 | Government | NaN | 0.555912 | 0.729567 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -815.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | 297000.0 | Unaccompanied | Working | Secondary / secondary special | Civil marriage | House / apartment | 0.008019 | -19005 | -3039 | -9833.0 | -2437 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Laborers | 2.0 | 2 | 2 | WEDNESDAY | 17 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | NaN | 0.650442 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | 0.0 | 2.0 | 0.0 | -617.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | 513000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.028663 | -19932 | -3038 | -4311.0 | -3458 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Core staff | 1.0 | 2 | 2 | THURSDAY | 11 | 0 | 0 | 0 | 0 | 1 | 1 | Religion | NaN | 0.322738 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -1106.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Utilizamos el ID de cliente como el índice del Dataframe. A continuación, identificamos las variables continuas y categóricas dentro del conjunto de datos
def dame_variables_categoricas(dataset=None):
'''
----------------------------------------------------------------------------------------------------------
Función dame_variables_categoricas:
----------------------------------------------------------------------------------------------------------
- Descripción:
Esta función recibe un DataFrame una lista de las variables categóricas
(con pocos valores únicos).
- Inputs:
- dataset (DataFrame): DataFrame que contiene los datos de entrada.
- Return:
- lista_variables_categoricas (list): Lista con los nombres de las variables
categóricas en el DataFrame.
- other (list): Lista con los nombres de las variables que no cumplen los criterios
para ser categóricas.
- 1: Indica que la ejecución es incorrecta debido a la falta del
argumento 'dataset'.
'''
# Verificar que el DataFrame de entrada no sea nulo
if dataset is None:
print(u'\nError: Falta el argumento dataset en la función')
return 1
lista_variables_categoricas = []
other = []
# Recorrer las columnas del DataFrame
for i in dataset.columns:
# Si la columna es de tipo objeto (posiblemente categórica)
if dataset[i].dtype == object:
unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
if unicos < 100:
lista_variables_categoricas.append(i)
else:
other.append(i)
# Si la columna es de tipo entero
if dataset[i].dtype == int:
unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
if unicos < 10:
lista_variables_categoricas.append(i)
else:
other.append(i)
return lista_variables_categoricas, other
import numpy as np
#Aquí identificamos las variables categóricas
list_var_cat, other = dame_variables_categoricas(dataset=pd_loan)
pd_loan[list_var_cat] = pd_loan[list_var_cat].astype("category")
#Seleccionamos las columnas que tienen datos numéricos continuos
list_var_continuous = list(pd_loan.select_dtypes('float').columns)
pd_loan[list_var_continuous] = pd_loan[list_var_continuous].astype(float)
pd_loan.dtypes
pd_loan.dtypes.to_dict()
{'TARGET': dtype('int64'),
'NAME_CONTRACT_TYPE': CategoricalDtype(categories=['Cash loans', 'Revolving loans'], ordered=False, categories_dtype=object),
'CODE_GENDER': CategoricalDtype(categories=['F', 'M', 'XNA'], ordered=False, categories_dtype=object),
'FLAG_OWN_CAR': CategoricalDtype(categories=['N', 'Y'], ordered=False, categories_dtype=object),
'FLAG_OWN_REALTY': CategoricalDtype(categories=['N', 'Y'], ordered=False, categories_dtype=object),
'CNT_CHILDREN': dtype('int64'),
'AMT_INCOME_TOTAL': dtype('float64'),
'AMT_CREDIT': dtype('float64'),
'AMT_ANNUITY': dtype('float64'),
'AMT_GOODS_PRICE': dtype('float64'),
'NAME_TYPE_SUITE': CategoricalDtype(categories=['Children', 'Family', 'Group of people', 'Other_A',
'Other_B', 'Spouse, partner', 'Unaccompanied'],
, ordered=False, categories_dtype=object),
'NAME_INCOME_TYPE': CategoricalDtype(categories=['Businessman', 'Commercial associate', 'Maternity leave',
'Pensioner', 'State servant', 'Student', 'Unemployed',
'Working'],
, ordered=False, categories_dtype=object),
'NAME_EDUCATION_TYPE': CategoricalDtype(categories=['Academic degree', 'Higher education', 'Incomplete higher',
'Lower secondary', 'Secondary / secondary special'],
, ordered=False, categories_dtype=object),
'NAME_FAMILY_STATUS': CategoricalDtype(categories=['Civil marriage', 'Married', 'Separated',
'Single / not married', 'Unknown', 'Widow'],
, ordered=False, categories_dtype=object),
'NAME_HOUSING_TYPE': CategoricalDtype(categories=['Co-op apartment', 'House / apartment',
'Municipal apartment', 'Office apartment',
'Rented apartment', 'With parents'],
, ordered=False, categories_dtype=object),
'REGION_POPULATION_RELATIVE': dtype('float64'),
'DAYS_BIRTH': dtype('int64'),
'DAYS_EMPLOYED': dtype('int64'),
'DAYS_REGISTRATION': dtype('float64'),
'DAYS_ID_PUBLISH': dtype('int64'),
'OWN_CAR_AGE': dtype('float64'),
'FLAG_MOBIL': dtype('int64'),
'FLAG_EMP_PHONE': dtype('int64'),
'FLAG_WORK_PHONE': dtype('int64'),
'FLAG_CONT_MOBILE': dtype('int64'),
'FLAG_PHONE': dtype('int64'),
'FLAG_EMAIL': dtype('int64'),
'OCCUPATION_TYPE': CategoricalDtype(categories=['Accountants', 'Cleaning staff', 'Cooking staff',
'Core staff', 'Drivers', 'HR staff', 'High skill tech staff',
'IT staff', 'Laborers', 'Low-skill Laborers', 'Managers',
'Medicine staff', 'Private service staff', 'Realty agents',
'Sales staff', 'Secretaries', 'Security staff',
'Waiters/barmen staff'],
, ordered=False, categories_dtype=object),
'CNT_FAM_MEMBERS': dtype('float64'),
'REGION_RATING_CLIENT': dtype('int64'),
'REGION_RATING_CLIENT_W_CITY': dtype('int64'),
'WEEKDAY_APPR_PROCESS_START': CategoricalDtype(categories=['FRIDAY', 'MONDAY', 'SATURDAY', 'SUNDAY', 'THURSDAY',
'TUESDAY', 'WEDNESDAY'],
, ordered=False, categories_dtype=object),
'HOUR_APPR_PROCESS_START': dtype('int64'),
'REG_REGION_NOT_LIVE_REGION': dtype('int64'),
'REG_REGION_NOT_WORK_REGION': dtype('int64'),
'LIVE_REGION_NOT_WORK_REGION': dtype('int64'),
'REG_CITY_NOT_LIVE_CITY': dtype('int64'),
'REG_CITY_NOT_WORK_CITY': dtype('int64'),
'LIVE_CITY_NOT_WORK_CITY': dtype('int64'),
'ORGANIZATION_TYPE': CategoricalDtype(categories=['Advertising', 'Agriculture', 'Bank',
'Business Entity Type 1', 'Business Entity Type 2',
'Business Entity Type 3', 'Cleaning', 'Construction',
'Culture', 'Electricity', 'Emergency', 'Government', 'Hotel',
'Housing', 'Industry: type 1', 'Industry: type 10',
'Industry: type 11', 'Industry: type 12',
'Industry: type 13', 'Industry: type 2', 'Industry: type 3',
'Industry: type 4', 'Industry: type 5', 'Industry: type 6',
'Industry: type 7', 'Industry: type 8', 'Industry: type 9',
'Insurance', 'Kindergarten', 'Legal Services', 'Medicine',
'Military', 'Mobile', 'Other', 'Police', 'Postal', 'Realtor',
'Religion', 'Restaurant', 'School', 'Security',
'Security Ministries', 'Self-employed', 'Services',
'Telecom', 'Trade: type 1', 'Trade: type 2', 'Trade: type 3',
'Trade: type 4', 'Trade: type 5', 'Trade: type 6',
'Trade: type 7', 'Transport: type 1', 'Transport: type 2',
'Transport: type 3', 'Transport: type 4', 'University',
'XNA'],
, ordered=False, categories_dtype=object),
'EXT_SOURCE_1': dtype('float64'),
'EXT_SOURCE_2': dtype('float64'),
'EXT_SOURCE_3': dtype('float64'),
'APARTMENTS_AVG': dtype('float64'),
'BASEMENTAREA_AVG': dtype('float64'),
'YEARS_BEGINEXPLUATATION_AVG': dtype('float64'),
'YEARS_BUILD_AVG': dtype('float64'),
'COMMONAREA_AVG': dtype('float64'),
'ELEVATORS_AVG': dtype('float64'),
'ENTRANCES_AVG': dtype('float64'),
'FLOORSMAX_AVG': dtype('float64'),
'FLOORSMIN_AVG': dtype('float64'),
'LANDAREA_AVG': dtype('float64'),
'LIVINGAPARTMENTS_AVG': dtype('float64'),
'LIVINGAREA_AVG': dtype('float64'),
'NONLIVINGAPARTMENTS_AVG': dtype('float64'),
'NONLIVINGAREA_AVG': dtype('float64'),
'APARTMENTS_MODE': dtype('float64'),
'BASEMENTAREA_MODE': dtype('float64'),
'YEARS_BEGINEXPLUATATION_MODE': dtype('float64'),
'YEARS_BUILD_MODE': dtype('float64'),
'COMMONAREA_MODE': dtype('float64'),
'ELEVATORS_MODE': dtype('float64'),
'ENTRANCES_MODE': dtype('float64'),
'FLOORSMAX_MODE': dtype('float64'),
'FLOORSMIN_MODE': dtype('float64'),
'LANDAREA_MODE': dtype('float64'),
'LIVINGAPARTMENTS_MODE': dtype('float64'),
'LIVINGAREA_MODE': dtype('float64'),
'NONLIVINGAPARTMENTS_MODE': dtype('float64'),
'NONLIVINGAREA_MODE': dtype('float64'),
'APARTMENTS_MEDI': dtype('float64'),
'BASEMENTAREA_MEDI': dtype('float64'),
'YEARS_BEGINEXPLUATATION_MEDI': dtype('float64'),
'YEARS_BUILD_MEDI': dtype('float64'),
'COMMONAREA_MEDI': dtype('float64'),
'ELEVATORS_MEDI': dtype('float64'),
'ENTRANCES_MEDI': dtype('float64'),
'FLOORSMAX_MEDI': dtype('float64'),
'FLOORSMIN_MEDI': dtype('float64'),
'LANDAREA_MEDI': dtype('float64'),
'LIVINGAPARTMENTS_MEDI': dtype('float64'),
'LIVINGAREA_MEDI': dtype('float64'),
'NONLIVINGAPARTMENTS_MEDI': dtype('float64'),
'NONLIVINGAREA_MEDI': dtype('float64'),
'FONDKAPREMONT_MODE': CategoricalDtype(categories=['not specified', 'org spec account', 'reg oper account',
'reg oper spec account'],
, ordered=False, categories_dtype=object),
'HOUSETYPE_MODE': CategoricalDtype(categories=['block of flats', 'specific housing', 'terraced house'], ordered=False, categories_dtype=object),
'TOTALAREA_MODE': dtype('float64'),
'WALLSMATERIAL_MODE': CategoricalDtype(categories=['Block', 'Mixed', 'Monolithic', 'Others', 'Panel',
'Stone, brick', 'Wooden'],
, ordered=False, categories_dtype=object),
'EMERGENCYSTATE_MODE': CategoricalDtype(categories=['No', 'Yes'], ordered=False, categories_dtype=object),
'OBS_30_CNT_SOCIAL_CIRCLE': dtype('float64'),
'DEF_30_CNT_SOCIAL_CIRCLE': dtype('float64'),
'OBS_60_CNT_SOCIAL_CIRCLE': dtype('float64'),
'DEF_60_CNT_SOCIAL_CIRCLE': dtype('float64'),
'DAYS_LAST_PHONE_CHANGE': dtype('float64'),
'FLAG_DOCUMENT_2': dtype('int64'),
'FLAG_DOCUMENT_3': dtype('int64'),
'FLAG_DOCUMENT_4': dtype('int64'),
'FLAG_DOCUMENT_5': dtype('int64'),
'FLAG_DOCUMENT_6': dtype('int64'),
'FLAG_DOCUMENT_7': dtype('int64'),
'FLAG_DOCUMENT_8': dtype('int64'),
'FLAG_DOCUMENT_9': dtype('int64'),
'FLAG_DOCUMENT_10': dtype('int64'),
'FLAG_DOCUMENT_11': dtype('int64'),
'FLAG_DOCUMENT_12': dtype('int64'),
'FLAG_DOCUMENT_13': dtype('int64'),
'FLAG_DOCUMENT_14': dtype('int64'),
'FLAG_DOCUMENT_15': dtype('int64'),
'FLAG_DOCUMENT_16': dtype('int64'),
'FLAG_DOCUMENT_17': dtype('int64'),
'FLAG_DOCUMENT_18': dtype('int64'),
'FLAG_DOCUMENT_19': dtype('int64'),
'FLAG_DOCUMENT_20': dtype('int64'),
'FLAG_DOCUMENT_21': dtype('int64'),
'AMT_REQ_CREDIT_BUREAU_HOUR': dtype('float64'),
'AMT_REQ_CREDIT_BUREAU_DAY': dtype('float64'),
'AMT_REQ_CREDIT_BUREAU_WEEK': dtype('float64'),
'AMT_REQ_CREDIT_BUREAU_MON': dtype('float64'),
'AMT_REQ_CREDIT_BUREAU_QRT': dtype('float64'),
'AMT_REQ_CREDIT_BUREAU_YEAR': dtype('float64')}
Algunas variables clasificadas inicialmente como enteras son de carácter booleano, lo que las hace categóricas.
SEPARACIÓN EN TRAIN Y TEST ESTRATIFICADO
pd_plot_target = pd_loan['TARGET'].value_counts(normalize=False).reset_index()
pd_plot_target.columns = ['TARGET', 'count']
fig = px.bar(pd_plot_target, x='TARGET', y='count')
fig.update_xaxes(tickvals=[0, 1])
fig.show()
Se observa que el muestreo está desbalanceado, ya que la mayoría de los clientes no tienen dificultades para pagar el préstamo. (0 = pagado y 1 = no pagado) El 20% de los datos se asignan al conjunto de prueba y el 80% al conjunto de entrenamiento para entrenar el modelo.
from sklearn.model_selection import train_test_split
X_pd_loan, X_pd_loan_test, y_pd_loan, y_pd_loan_test = train_test_split(pd_loan.drop('TARGET',axis=1),
pd_loan['TARGET'],
stratify=pd_loan['TARGET'],
test_size=0.2)
pd_loan_train = pd.concat([X_pd_loan, y_pd_loan],axis=1)
pd_loan_test = pd.concat([X_pd_loan_test, y_pd_loan_test],axis=1)
print('== Train\n', pd_loan_train['TARGET'].value_counts(normalize=True))
print('== Test\n', pd_loan_test['TARGET'].value_counts(normalize=True))
== Train TARGET 0 0.919271 1 0.080729 Name: proportion, dtype: float64 == Test TARGET 0 0.919272 1 0.080728 Name: proportion, dtype: float64
Como observamos en el resultado, cada categoría de la variable target tiene la misma proporción tanto en el train como en el test, lo que permite que el modelo tenga ejemplos de todas las clases en ambos conjuntos y asi evita que el modelo tenga un sesgo hacia una clase en particular.
VISUALIZACIÓN DESCRIPTIVA DE LOS DATOS
Aquí vamos a realizar la suma de la cantidad de valores nulos que hay por filas y por columnas.
pd_null_columnas contiene el número de valores nulos por columna y el % de valores nulos respecto al total de filas. pd_null_filas contiene el número de valores nulos por filas y el porcentaje de valores nulos respecto al total de columnas, y la relación con la variable TARGET
#Veo el número de valores nulos por filas y columnas
pd_series_null_columns = pd_loan_train.isnull().sum().sort_values(ascending=False)
pd_series_null_rows = pd_loan_train.isnull().sum(axis=1).sort_values(ascending=False)
print(pd_series_null_columns.shape, pd_series_null_rows.shape)
pd_null_columnas = pd.DataFrame(pd_series_null_columns, columns=['nulos_columnas'])
pd_null_filas = pd.DataFrame(pd_series_null_rows, columns=['nulos_filas'])
pd_null_filas['TARGET'] = pd_loan['TARGET'].copy()
pd_null_columnas['porcentaje_columnas'] = pd_null_columnas['nulos_columnas']/pd_loan_train.shape[0]
pd_null_filas['porcentaje_filas']= pd_null_filas['nulos_filas']/pd_loan_train.shape[1]
(121,) (246008,)
pd_null_columnas.head()
| nulos_columnas | porcentaje_columnas | |
|---|---|---|
| COMMONAREA_MODE | 171913 | 0.698811 |
| COMMONAREA_MEDI | 171913 | 0.698811 |
| COMMONAREA_AVG | 171913 | 0.698811 |
| NONLIVINGAPARTMENTS_MEDI | 170833 | 0.694421 |
| NONLIVINGAPARTMENTS_MODE | 170833 | 0.694421 |
pd_null_filas.head()
| nulos_filas | TARGET | porcentaje_filas | |
|---|---|---|---|
| SK_ID_CURR | |||
| 274127 | 61 | 0 | 0.504132 |
| 255145 | 61 | 0 | 0.504132 |
| 412312 | 61 | 0 | 0.504132 |
| 329262 | 61 | 0 | 0.504132 |
| 180861 | 61 | 0 | 0.504132 |
Hemos hecho un conteo y porcentaje de valores nulos por filas y columnas para identificar qué columnas y filas tienen más valores nulos por si hubiese que tomar la decisión de imputarlas o eliminarlas
list_var_cat
['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']
#Realizamos un bucle para las variables categóricas
for variable in list_var_cat:
if variable != "TARGET":
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
# Gráfico de conteo
sns.countplot(data=pd_loan_train, x=variable, ax=axes[0])
axes[0].set_title(f'{variable} Número de nulos: {pd_loan_train[variable].isna().sum()}')
axes[0].set_ylabel('Count')
# Rotar etiquetas del eje X
axes[0].tick_params(axis='x', rotation=45)
# Para cada categoría: gráfico de fracciones
pd_loan_df = pd_loan_train.groupby([variable, 'TARGET']).size().reset_index(name='count')
pd_loan_df['fraction'] = pd_loan_df['count'] / pd_loan_df.groupby(variable)['count'].transform('sum')
sns.barplot(data=pd_loan_df, x=variable, y='fraction', hue='TARGET', ax=axes[1])
axes[1].set_title(f'{variable} vs TARGET')
axes[1].set_ylabel('TARGET fraction')
# Rotar etiquetas del eje X
axes[1].tick_params(axis='x', rotation=45)
# Ajustar diseño
plt.tight_layout()
plt.show()
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\Guille\AppData\Local\Temp\ipykernel_15832\1351463941.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
A partir de estos gráficos, podemos ver a la izquierda el número de valores nulos para cada categoría de las variables, y en el de la derecha, cómo se distribuye una variable en función de la variable TARGET. Por lo general, vemos que los datos están bastante desbalanceados, pues hay muchas variables que no vemos que tengan un impacto fuerte en el impago del préstamo. A continuación, vamos a comentar algunas variables que puedan ser significativas en el impago del préstamo.
NAME_INCOME_TYPE (Situación laboral del solicitante) En el gráfico de la izquierda vemos que la mayoría de los datos se asocian con 'Working', es decir, personas que tienen un empleo. Otras categorías presentan también una parte de los datos, como 'Pensioner' o 'Commercial Associate'. Otras categorías como 'Student' o 'Unemployed' presentan muy poca distribución de los datos, o ninguna. En el gráfico de la derecha, que muestra la fracción de incumplimiento para cada categoría, vemos que las personas desempleadas o las que están de baja por maternidad, tienen un porcentaje alto de incumplimiento. Los desempleados parece lógico, pues se asocia con la falta de un trabajo estable. Otras categorías como los pensionistas o los trabajadores presentan un grado bajo de incumplimiento, pues están asociados con un salario o pensión.
NAME_EDUCATION_TYPE(Nivel de educación) En el gráfico de la izquierda vemos que los datos se distribuyen sobre todo en clientes que tienen 'Secondary/Secondary Special', seguidos de 'Higher Education'. 'Academid degree' o 'Lower Secundary' son menos comunes.En el gráfico de la derecha se muestra las fracciones de incumplimiento para cada nivel educativo. Como vemos, secundaria, secundaria especial o secundaria baja son los niveles que más probabilidad de incumplimiento tienen, algo que puede estar relacionado por el tipo de empleo que estos clientes tienen, asociado con la baja formación académica recibida. Los clientes con un grado o educación superior presentan menor riesgo de incumplimiento, algo que también puede estar relacionado por el caso contrario al anterior.Podemos concluir que el nivel educativo influye en la capacidad de incumplimiento del solicitante.
NAME_HOUSING_TYPE(Tipo de vivienda) En el gráfico de la izquierda vemos que la mayoría de los datos se asocian con clientes que vivene un apartameto o casa, que por otra parte es lo más normal. Otras categorías presentan una distribución de los datos mucho menor, como 'Municipal Apartment', 'With Parents' o 'Rented Apartment'. En cuanto a su relación con TARGET, vemos que la fracción de incumplimineto es algo mayor para aquellos solicitantes que viven con los padres o de alquiler. Esto puede estar relacionado con un menor grado de cumplimineto, pues los que viven con sus padres es posible que no tengan una ocupación, aunque también pueden ser aún jóvenes para independizarse. Aun así, no parece una variable muy significativa en cuanto al grado de incumplimiento del préstamo.
OCCUPATYON_TYPE (Tipo de trabajo) Al observar la variable, vemos que presenta una buena cantidad de nulos que podríamos imputar para mejorar el análisis. En el gráfico de la izquierda, vemos que la mayoría de los solicitantes pertenecen a 'Laborers'(en mayor proporción), 'Core Staff' y 'Sales Staff'. Como vemos en el gráfico de la derecha, los trabajadores dedicados a actividades manuales o físicas como la construcción, fabricación, agaricultura, así como aquellos con una baja especialización, como por ejemplo, ayudante de obra, presentan mayor incumplimineto de devolver el préstamo. Otros gremios, como los conductores o los vigilantes de seguridad presentan un grado de incumplimiento menor, aunque mayor que el resto. Esto puede estar debido a la formación educativa, ya que estos trabajos no requieren de una cualificación para dedicarse a ellos. Podemos concluir que el tipo de ocupación puede influir en el grado de incumplimiento del préstamo.
CODE_GENDER(Género) Como vemos en el gráfico de la izquierda, hay mayor proporción de clientes femeninos que masculinos.En el gráfico de la derecha no parece haber una diferencia extremadamente significativa entre el grado de incumplimineto de hombres y mujeres, aunque si nos fijamos el grado de incumplimiento mayor es ligeramente masculino. Esto puede deberse a muchas razones como el tipo de trabajo, mentalidad o tipo de educación, entre otras. En conclusión, esta variable tiene una relación moderada con la probabilidad de incumplkimiento, siendo importante considerar cómoa actúa con otras variables.
NAME_CONTRACT_TYPE (Tipo de contrato) En el gráfico de la izquierda vemos que la mayoría de los préstamos pertenecen a 'Cash Loans', mientras que los 'Revolving Loans' tienen una proproción mucho menor. Con respecto a la variable TARGET, vemos que no hay una gran diferencia en la probabilidad de incumplimiento entre ambos préstamos, pero sí que es verdad que los 'Cash Loans' tienen una probabilidad mayor, cosa que tiene sentido al ser mucho más comunes en el día a día.
for variable in list_var_continuous: # Lista de variables continuas
if variable != "TARGET":
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
# Histograma
sns.histplot(data=pd_loan_train, x=variable, kde=True, ax=axes[0], bins=30)
axes[0].set_title(f'{variable} Número de nulos: {pd_loan_train[variable].isna().sum()}')
axes[0].set_ylabel('Frequency')
# Boxplot vs TARGET
sns.boxplot(data=pd_loan_train, x='TARGET', y=variable, ax=axes[1])
axes[1].set_title(f'{variable} vs TARGET')
axes[1].set_ylabel(variable)
axes[1].set_xlabel('TARGET')
# Ajustar diseño
plt.tight_layout()
plt.show()
Una vez realizados los gráficos de las variables continuas con la variable objetivo (TARGET), procederemos a realizar algunos comentarios de algunas variables que puedan influir en la misma, así como algunas que no sean tan significativas para tener un ejemplo de todo. A continuación, veremos cuáles son las variables elegidas.
OWN_CAR_AGE (Edad del coche propio). Como vemos en el gráfico de la izquierda, hay un número considerable de valores nulos, concretamente 162.426, lo que indica que muchas personas no poseen automóvil o no hay información a cerca de ellos. Esto se debería gestionar mediante, por ejemplo, con la imputación o creando una categoría específica para los que no tienen automóvil. La mayoría de los coches tienen entre 0 y 20 años, con una concentración importante sobre todo de 0 a 10. No hay casi valores más allá de los 40 años, solo unos pocos datos sobrepasan los 60 años. En cuanto a su relación con TARGET, vemos en los boxplots que no hay grandes diferencias en la mediana ni en el rango intercuartílico para 0 y para 1, lo que nos puede indicar que no el coche propio puede no ser un factor influyente en cuanto a devolver o no el crédito.
EXT_SOURCE_1 (Puntuaciones de riesgo crediticio para los solicitantes que provienen de fuentes externas). En el gráfico de la izquierda se puede observar que la variable sigue una distribución simétrica con forma similar a una distribución normal. Esta variable tiene una elevada cantidad de valores lo que indica que puede ser útil para discriminar entre distintos tipos de solicitantes. En cuanto a su relación con TARGET, observamos que el boxplot muestra que la mediana para los clientes que no incumplen (TARGET=0) es algo mayor que la de los que sí incumplen (TARGET=1) tienden a tener valores más altos, lo que puede decir que esta variable está asociada con un menor riesgo de incumplimiento.El rango intercuartílico es algo mayor también en los clientes cumplidores que en los que no. Podemos decir que esta variable puede tener un cierto poder predictivo.Los valores altos de EXT_SOURCE_1 se asocian con un menor riesgo de incumplimiento.
CNT_FAM_MEMBERS (Numero de miembros de la familia). En el gráfico de la izquierda vemos que la variable posee 2 valores nulos, algo que no es significativo dentro del total de datos. En el histograma se muestra que la mayoría de los datos se encuentran entre 2 y 4 miembros, concentrándose sobre todo en 2, sugiriendo que una gran parte de las familias son pequeñas. En relación con TARGET, el boxplot muestra que la distribución es similiar tanto para los cumplidores como para los que no. Tanto la mediana como el rango intercuartílico son iguales, lo que indica que no hay una gran relación con la probabilidad de impago. Hay algunas familias con un número elevado de miembros, aunque no parece que tengan un efecto significativo.
AMT_INCOME_TOTAL (Ingresos totales). Esta variable no posee valores nulos, por lo que podría ser representativa. En cuanto al histograma, la distribución de la variable está sesgada hacia la izquierda, lo que indica que la mayoría de clientes tienen ingresos bajos. La distribuación de los ingresos para TARGET=0 y TARGET=1 es muy similar, La mediana y el rango intercuartílico son también casi iguales, lo que indica que el nivel de ingresos no tiene una relación directa con la posibilidad de impago.
EXT_SOURCE_2 (Puntuaciones de riesgo crediticio para los solicitantes que provienen de fuentes externas) En este caso, presenta pocos valores nulos con respecto al total de los datos, aunque se podrían imputar para no perder demasiada información. En el histograma vemos que la mayoría de los valores están concentrados entre 0.4 y 0.8, es decir, sesgados a la derecha, mientras que los valores extremos son menos comunes.En cuanto a su relación con TARGET, vemos que los valores más altos están relacionados con los clientes que no incumplen. La mediana es mas alta para los clientes que no incumplen, lo que un valor más alto de la variable podría estar asociado con un menor riesgo de incumplimiento. El rango intercuartílico es bastante parecido, aunque se observa que los valores bajos se concentran más en los clientes que presentan incumplimiento, por lo que podemos decir que esta variable puede ser un buen predictor.
EXT_SOURCE_3 (Puntuaciones de riesgo crediticio para los solicitantes que provienen de fuentes externas) Al igual que en las variables anteriores como EXT_SOURCE_1 y EXT_SOURCE_2, prsenta los valores más altos asocidados a clientes que no incumplen el préstamo, lo que indica que valores bajos poseen un mayor riesgo de incumplimiento. Por lo tanto, podemos concluir que la información externa sobre clientes solicitantes de préstamo es una buena variable predictora.
TRATAMIENTO DE VARIABLES CONTINUAS
A continuación se tratarán tres aspectos clave del análisis de los datos: los valores missing, los outliers y las correlaciones entre las variables continuas
list_var_continuous
['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_REGISTRATION', 'OWN_CAR_AGE', 'CNT_FAM_MEMBERS', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'TOTALAREA_MODE', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR']
def get_deviation_of_mean_perc(pd_loan, list_var_continuous, target, multiplier):
'''
----------------------------------------------------------------------------------------------------------
Función get_deviation_of_mean_perc:
----------------------------------------------------------------------------------------------------------
- Descripción:
Esta función calcula el porcentaje de valores que se encuentran fuera de un
intervalo de confianza, determinado por la media y una desviación estándar
multiplicada por un factor (multiplier), para cada variable continua en el
DataFrame. Luego, analiza la relación entre estos valores atípicos y la variable
objetivo (TARGET), y devuelve un resumen con los porcentajes de valores atípicos
y su distribución en relación con la variable objetivo.
- Inputs:
- pd_loan (DataFrame): DataFrame que contiene los datos de entrada.
- list_var_continuous (list): Lista con los nombres de las variables continuas
a analizar.
- target (str): Nombre de la variable objetivo en el DataFrame.
- multiplier (float): Factor multiplicador para calcular el intervalo de confianza
(desviación estándar).
- Return:
- pd_final (DataFrame): DataFrame que contiene el porcentaje de valores atípicos
por cada variable continua, su distribución con respecto a la variable objetivo
(TARGET), y otros detalles relevantes.
'''
pd_final = pd.DataFrame()
for i in list_var_continuous:
series_mean = pd_loan[i].mean()
series_std = pd_loan[i].std()
std_amp = multiplier * series_std
left = series_mean - std_amp
right = series_mean + std_amp
size_s = pd_loan[i].size
perc_goods = pd_loan[i][(pd_loan[i] >= left) & (pd_loan[i] <= right)].size/size_s
perc_excess = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size/size_s
if perc_excess>0:
pd_concat_percent = pd.DataFrame(pd_loan[target][(pd_loan[i] < left) | (pd_loan[i] > right)]\
.value_counts(normalize=True).reset_index()).T
pd_concat_percent.columns = [pd_concat_percent.iloc[0,0],
pd_concat_percent.iloc[0,1]]
pd_concat_percent['variable'] = i
pd_concat_percent['sum_outlier_values'] = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size
pd_concat_percent['porcentaje_sum_null_values'] = perc_excess
pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
if pd_final.empty:
print('No existen variables con valores nulos')
return pd_final
get_deviation_of_mean_perc(pd_loan_train, list_var_continuous, target='TARGET', multiplier=3)
| 0.0 | 1.0 | variable | sum_outlier_values | porcentaje_sum_null_values | |
|---|---|---|---|---|---|
| 0 | 0.000000 | 1.000000 | AMT_INCOME_TOTAL | 2187 | 0.008890 |
| 1 | 0.946502 | 0.053498 | AMT_INCOME_TOTAL | 2187 | 0.008890 |
| 2 | 0.000000 | 1.000000 | AMT_CREDIT | 2596 | 0.010553 |
| 3 | 0.957242 | 0.042758 | AMT_CREDIT | 2596 | 0.010553 |
| 4 | 0.000000 | 1.000000 | AMT_ANNUITY | 2356 | 0.009577 |
| 5 | 0.963497 | 0.036503 | AMT_ANNUITY | 2356 | 0.009577 |
| 6 | 0.000000 | 1.000000 | AMT_GOODS_PRICE | 3298 | 0.013406 |
| 7 | 0.960279 | 0.039721 | AMT_GOODS_PRICE | 3298 | 0.013406 |
| 8 | 0.000000 | 1.000000 | REGION_POPULATION_RELATIVE | 6698 | 0.027227 |
| 9 | 0.961033 | 0.038967 | REGION_POPULATION_RELATIVE | 6698 | 0.027227 |
| 10 | 0.000000 | 1.000000 | DAYS_REGISTRATION | 608 | 0.002471 |
| 11 | 0.960526 | 0.039474 | DAYS_REGISTRATION | 608 | 0.002471 |
| 12 | 0.000000 | 1.000000 | OWN_CAR_AGE | 2664 | 0.010829 |
| 13 | 0.920420 | 0.079580 | OWN_CAR_AGE | 2664 | 0.010829 |
| 14 | 0.000000 | 1.000000 | CNT_FAM_MEMBERS | 3205 | 0.013028 |
| 15 | 0.898284 | 0.101716 | CNT_FAM_MEMBERS | 3205 | 0.013028 |
| 16 | 0.000000 | 1.000000 | APARTMENTS_AVG | 2345 | 0.009532 |
| 17 | 0.947548 | 0.052452 | APARTMENTS_AVG | 2345 | 0.009532 |
| 18 | 0.000000 | 1.000000 | BASEMENTAREA_AVG | 1587 | 0.006451 |
| 19 | 0.943289 | 0.056711 | BASEMENTAREA_AVG | 1587 | 0.006451 |
| 20 | 0.000000 | 1.000000 | YEARS_BEGINEXPLUATATION_AVG | 553 | 0.002248 |
| 21 | 0.905967 | 0.094033 | YEARS_BEGINEXPLUATATION_AVG | 553 | 0.002248 |
| 22 | 0.000000 | 1.000000 | YEARS_BUILD_AVG | 947 | 0.003849 |
| 23 | 0.927138 | 0.072862 | YEARS_BUILD_AVG | 947 | 0.003849 |
| 24 | 0.000000 | 1.000000 | COMMONAREA_AVG | 1364 | 0.005545 |
| 25 | 0.949413 | 0.050587 | COMMONAREA_AVG | 1364 | 0.005545 |
| 26 | 0.000000 | 1.000000 | ELEVATORS_AVG | 1899 | 0.007719 |
| 27 | 0.953133 | 0.046867 | ELEVATORS_AVG | 1899 | 0.007719 |
| 28 | 0.000000 | 1.000000 | ENTRANCES_AVG | 1746 | 0.007097 |
| 29 | 0.935281 | 0.064719 | ENTRANCES_AVG | 1746 | 0.007097 |
| 30 | 0.000000 | 1.000000 | FLOORSMAX_AVG | 2051 | 0.008337 |
| 31 | 0.958557 | 0.041443 | FLOORSMAX_AVG | 2051 | 0.008337 |
| 32 | 0.000000 | 1.000000 | FLOORSMIN_AVG | 474 | 0.001927 |
| 33 | 0.970464 | 0.029536 | FLOORSMIN_AVG | 474 | 0.001927 |
| 34 | 0.000000 | 1.000000 | LANDAREA_AVG | 1672 | 0.006797 |
| 35 | 0.935407 | 0.064593 | LANDAREA_AVG | 1672 | 0.006797 |
| 36 | 0.000000 | 1.000000 | LIVINGAPARTMENTS_AVG | 1376 | 0.005593 |
| 37 | 0.946948 | 0.053052 | LIVINGAPARTMENTS_AVG | 1376 | 0.005593 |
| 38 | 0.000000 | 1.000000 | LIVINGAREA_AVG | 2534 | 0.010300 |
| 39 | 0.946725 | 0.053275 | LIVINGAREA_AVG | 2534 | 0.010300 |
| 40 | 0.000000 | 1.000000 | NONLIVINGAPARTMENTS_AVG | 577 | 0.002345 |
| 41 | 0.925477 | 0.074523 | NONLIVINGAPARTMENTS_AVG | 577 | 0.002345 |
| 42 | 0.000000 | 1.000000 | NONLIVINGAREA_AVG | 1907 | 0.007752 |
| 43 | 0.947562 | 0.052438 | NONLIVINGAREA_AVG | 1907 | 0.007752 |
| 44 | 0.000000 | 1.000000 | APARTMENTS_MODE | 2346 | 0.009536 |
| 45 | 0.945865 | 0.054135 | APARTMENTS_MODE | 2346 | 0.009536 |
| 46 | 0.000000 | 1.000000 | BASEMENTAREA_MODE | 1648 | 0.006699 |
| 47 | 0.941141 | 0.058859 | BASEMENTAREA_MODE | 1648 | 0.006699 |
| 48 | 0.000000 | 1.000000 | YEARS_BEGINEXPLUATATION_MODE | 542 | 0.002203 |
| 49 | 0.904059 | 0.095941 | YEARS_BEGINEXPLUATATION_MODE | 542 | 0.002203 |
| 50 | 0.000000 | 1.000000 | YEARS_BUILD_MODE | 955 | 0.003882 |
| 51 | 0.924607 | 0.075393 | YEARS_BUILD_MODE | 955 | 0.003882 |
| 52 | 0.000000 | 1.000000 | COMMONAREA_MODE | 1357 | 0.005516 |
| 53 | 0.944731 | 0.055269 | COMMONAREA_MODE | 1357 | 0.005516 |
| 54 | 0.000000 | 1.000000 | ELEVATORS_MODE | 2620 | 0.010650 |
| 55 | 0.949618 | 0.050382 | ELEVATORS_MODE | 2620 | 0.010650 |
| 56 | 0.000000 | 1.000000 | ENTRANCES_MODE | 2084 | 0.008471 |
| 57 | 0.939060 | 0.060940 | ENTRANCES_MODE | 2084 | 0.008471 |
| 58 | 0.000000 | 1.000000 | FLOORSMAX_MODE | 2072 | 0.008422 |
| 59 | 0.960907 | 0.039093 | FLOORSMAX_MODE | 2072 | 0.008422 |
| 60 | 0.000000 | 1.000000 | FLOORSMIN_MODE | 386 | 0.001569 |
| 61 | 0.968912 | 0.031088 | FLOORSMIN_MODE | 386 | 0.001569 |
| 62 | 0.000000 | 1.000000 | LANDAREA_MODE | 1694 | 0.006886 |
| 63 | 0.935065 | 0.064935 | LANDAREA_MODE | 1694 | 0.006886 |
| 64 | 0.000000 | 1.000000 | LIVINGAPARTMENTS_MODE | 1423 | 0.005784 |
| 65 | 0.944483 | 0.055517 | LIVINGAPARTMENTS_MODE | 1423 | 0.005784 |
| 66 | 0.000000 | 1.000000 | LIVINGAREA_MODE | 2658 | 0.010805 |
| 67 | 0.945824 | 0.054176 | LIVINGAREA_MODE | 2658 | 0.010805 |
| 68 | 0.000000 | 1.000000 | NONLIVINGAPARTMENTS_MODE | 544 | 0.002211 |
| 69 | 0.917279 | 0.082721 | NONLIVINGAPARTMENTS_MODE | 544 | 0.002211 |
| 70 | 0.000000 | 1.000000 | NONLIVINGAREA_MODE | 1946 | 0.007910 |
| 71 | 0.948613 | 0.051387 | NONLIVINGAREA_MODE | 1946 | 0.007910 |
| 72 | 0.000000 | 1.000000 | APARTMENTS_MEDI | 2373 | 0.009646 |
| 73 | 0.947324 | 0.052676 | APARTMENTS_MEDI | 2373 | 0.009646 |
| 74 | 0.000000 | 1.000000 | BASEMENTAREA_MEDI | 1592 | 0.006471 |
| 75 | 0.942839 | 0.057161 | BASEMENTAREA_MEDI | 1592 | 0.006471 |
| 76 | 0.000000 | 1.000000 | YEARS_BEGINEXPLUATATION_MEDI | 520 | 0.002114 |
| 77 | 0.901923 | 0.098077 | YEARS_BEGINEXPLUATATION_MEDI | 520 | 0.002114 |
| 78 | 0.000000 | 1.000000 | YEARS_BUILD_MEDI | 958 | 0.003894 |
| 79 | 0.927975 | 0.072025 | YEARS_BUILD_MEDI | 958 | 0.003894 |
| 80 | 0.000000 | 1.000000 | COMMONAREA_MEDI | 1378 | 0.005601 |
| 81 | 0.948476 | 0.051524 | COMMONAREA_MEDI | 1378 | 0.005601 |
| 82 | 0.000000 | 1.000000 | ELEVATORS_MEDI | 2743 | 0.011150 |
| 83 | 0.950055 | 0.049945 | ELEVATORS_MEDI | 2743 | 0.011150 |
| 84 | 0.000000 | 1.000000 | ENTRANCES_MEDI | 1757 | 0.007142 |
| 85 | 0.934548 | 0.065452 | ENTRANCES_MEDI | 1757 | 0.007142 |
| 86 | 0.000000 | 1.000000 | FLOORSMAX_MEDI | 2148 | 0.008731 |
| 87 | 0.959497 | 0.040503 | FLOORSMAX_MEDI | 2148 | 0.008731 |
| 88 | 0.000000 | 1.000000 | FLOORSMIN_MEDI | 441 | 0.001793 |
| 89 | 0.970522 | 0.029478 | FLOORSMIN_MEDI | 441 | 0.001793 |
| 90 | 0.000000 | 1.000000 | LANDAREA_MEDI | 1696 | 0.006894 |
| 91 | 0.936321 | 0.063679 | LANDAREA_MEDI | 1696 | 0.006894 |
| 92 | 0.000000 | 1.000000 | LIVINGAPARTMENTS_MEDI | 1390 | 0.005650 |
| 93 | 0.945324 | 0.054676 | LIVINGAPARTMENTS_MEDI | 1390 | 0.005650 |
| 94 | 0.000000 | 1.000000 | LIVINGAREA_MEDI | 2542 | 0.010333 |
| 95 | 0.946892 | 0.053108 | LIVINGAREA_MEDI | 2542 | 0.010333 |
| 96 | 0.000000 | 1.000000 | NONLIVINGAPARTMENTS_MEDI | 575 | 0.002337 |
| 97 | 0.926957 | 0.073043 | NONLIVINGAPARTMENTS_MEDI | 575 | 0.002337 |
| 98 | 0.000000 | 1.000000 | NONLIVINGAREA_MEDI | 1930 | 0.007845 |
| 99 | 0.948705 | 0.051295 | NONLIVINGAREA_MEDI | 1930 | 0.007845 |
| 100 | 0.000000 | 1.000000 | TOTALAREA_MODE | 2624 | 0.010666 |
| 101 | 0.953506 | 0.046494 | TOTALAREA_MODE | 2624 | 0.010666 |
| 102 | 0.000000 | 1.000000 | OBS_30_CNT_SOCIAL_CIRCLE | 4897 | 0.019906 |
| 103 | 0.908515 | 0.091485 | OBS_30_CNT_SOCIAL_CIRCLE | 4897 | 0.019906 |
| 104 | 0.000000 | 1.000000 | DEF_30_CNT_SOCIAL_CIRCLE | 5444 | 0.022129 |
| 105 | 0.879133 | 0.120867 | DEF_30_CNT_SOCIAL_CIRCLE | 5444 | 0.022129 |
| 106 | 0.000000 | 1.000000 | OBS_60_CNT_SOCIAL_CIRCLE | 4753 | 0.019321 |
| 107 | 0.908689 | 0.091311 | OBS_60_CNT_SOCIAL_CIRCLE | 4753 | 0.019321 |
| 108 | 0.000000 | 1.000000 | DEF_60_CNT_SOCIAL_CIRCLE | 3134 | 0.012739 |
| 109 | 0.873644 | 0.126356 | DEF_60_CNT_SOCIAL_CIRCLE | 3134 | 0.012739 |
| 110 | 0.000000 | 1.000000 | DAYS_LAST_PHONE_CHANGE | 509 | 0.002069 |
| 111 | 0.950884 | 0.049116 | DAYS_LAST_PHONE_CHANGE | 509 | 0.002069 |
| 112 | 0.000000 | 1.000000 | AMT_REQ_CREDIT_BUREAU_HOUR | 1312 | 0.005333 |
| 113 | 0.916159 | 0.083841 | AMT_REQ_CREDIT_BUREAU_HOUR | 1312 | 0.005333 |
| 114 | 0.000000 | 1.000000 | AMT_REQ_CREDIT_BUREAU_DAY | 1206 | 0.004902 |
| 115 | 0.904643 | 0.095357 | AMT_REQ_CREDIT_BUREAU_DAY | 1206 | 0.004902 |
| 116 | 0.000000 | 1.000000 | AMT_REQ_CREDIT_BUREAU_WEEK | 6835 | 0.027784 |
| 117 | 0.921287 | 0.078713 | AMT_REQ_CREDIT_BUREAU_WEEK | 6835 | 0.027784 |
| 118 | 0.000000 | 1.000000 | AMT_REQ_CREDIT_BUREAU_MON | 2588 | 0.010520 |
| 119 | 0.943199 | 0.056801 | AMT_REQ_CREDIT_BUREAU_MON | 2588 | 0.010520 |
| 120 | 0.000000 | 1.000000 | AMT_REQ_CREDIT_BUREAU_QRT | 1850 | 0.007520 |
| 121 | 0.909730 | 0.090270 | AMT_REQ_CREDIT_BUREAU_QRT | 1850 | 0.007520 |
| 122 | 0.000000 | 1.000000 | AMT_REQ_CREDIT_BUREAU_YEAR | 2711 | 0.011020 |
| 123 | 0.909259 | 0.090741 | AMT_REQ_CREDIT_BUREAU_YEAR | 2711 | 0.011020 |
CORRELACIONES
def get_corr_matrix(dataset = None, metodo='pearson', size_figure=[10,8]):
''''
----------------------------------------------------------------------------------------------------------
Función get_corr_matrix:
----------------------------------------------------------------------------------------------------------
- Descripción:
Esta función calcula y visualiza la matriz de correlación entre las variables
numéricas de un conjunto de datos.
- Inputs:
- dataset (DataFrame): Conjunto de datos con las variables numéricas a analizar.
- metodo (str): Método de correlación a utilizar.
- size_figure (list): Tamaño de la figura del gráfico.
- Return:
- None: Muestra un mapa de calor de la matriz de correlación.
----------------------------------------------------------------------------------------------------------
'''
# Comprobación de que se ha proporcionado el dataset
if dataset is None:
print(u'\nHace falta pasar argumentos a la función')
return 1
sns.set(style="white")
# Calcular la matriz de correlación
corr = dataset.corr(method=metodo)
# Establecer la autocorrelación a cero para evitar distracciones
for i in range(corr.shape[0]):
corr.iloc[i, i] = 0
f, ax = plt.subplots(figsize=size_figure)
# Dibujar el mapa de calor con la correlación
sns.heatmap(corr, center=0,
square=True, linewidths=.5, cmap ='viridis' ) #cbar_kws={"shrink": .5}
plt.show()
return 0
get_corr_matrix(dataset = pd_loan_train[list_var_continuous],
metodo='pearson', size_figure=[10,8])
0
En la matriz de correlación se observan varias correlaciones de Pearson cercanas o iguales a 1, lo que indica que algunas variables están perfectamente correlacionadas. Esto puede deberse a la existencia de variables redundantes o derivadas de otras, lo que podría generar problemas de multicolinealidad en modelos estadísticos. Es recomendable revisar estas variables y, en caso de ser necesario, eliminar las que sean idénticas o altamente correlacionadas para evitar distorsiones en los resultados del modelo.
corr = pd_loan_train[list_var_continuous].corr('pearson')
new_corr = corr.abs()
new_corr.loc[:,:] = np.tril(new_corr, k=-1) # below main lower triangle of an array
new_corr = new_corr.stack().to_frame('correlation').reset_index().sort_values(by='correlation', ascending=False)
new_corr[new_corr['correlation']>0.6]
| level_0 | level_1 | correlation | |
|---|---|---|---|
| 3694 | OBS_60_CNT_SOCIAL_CIRCLE | OBS_30_CNT_SOCIAL_CIRCLE | 0.998514 |
| 2744 | YEARS_BUILD_MEDI | YEARS_BUILD_AVG | 0.998454 |
| 3074 | FLOORSMIN_MEDI | FLOORSMIN_AVG | 0.997279 |
| 3008 | FLOORSMAX_MEDI | FLOORSMAX_AVG | 0.997015 |
| 2942 | ENTRANCES_MEDI | ENTRANCES_AVG | 0.996893 |
| 2876 | ELEVATORS_MEDI | ELEVATORS_AVG | 0.996207 |
| 2810 | COMMONAREA_MEDI | COMMONAREA_AVG | 0.995471 |
| 3272 | LIVINGAREA_MEDI | LIVINGAREA_AVG | 0.995280 |
| 2546 | APARTMENTS_MEDI | APARTMENTS_AVG | 0.995117 |
| 2612 | BASEMENTAREA_MEDI | BASEMENTAREA_AVG | 0.994780 |
| 3206 | LIVINGAPARTMENTS_MEDI | LIVINGAPARTMENTS_AVG | 0.994544 |
| 2678 | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BEGINEXPLUATATION_AVG | 0.994378 |
| 3338 | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAPARTMENTS_AVG | 0.991786 |
| 3140 | LANDAREA_MEDI | LANDAREA_AVG | 0.991740 |
| 3404 | NONLIVINGAREA_MEDI | NONLIVINGAREA_AVG | 0.990674 |
| 2758 | YEARS_BUILD_MEDI | YEARS_BUILD_MODE | 0.989445 |
| 1834 | YEARS_BUILD_MODE | YEARS_BUILD_AVG | 0.989417 |
| 3088 | FLOORSMIN_MEDI | FLOORSMIN_MODE | 0.988434 |
| 3022 | FLOORSMAX_MEDI | FLOORSMAX_MODE | 0.988090 |
| 196 | AMT_GOODS_PRICE | AMT_CREDIT | 0.986929 |
| 2164 | FLOORSMIN_MODE | FLOORSMIN_AVG | 0.985960 |
| 2098 | FLOORSMAX_MODE | FLOORSMAX_AVG | 0.985577 |
| 2890 | ELEVATORS_MEDI | ELEVATORS_MODE | 0.982941 |
| 3154 | LANDAREA_MEDI | LANDAREA_MODE | 0.981883 |
| 2956 | ENTRANCES_MEDI | ENTRANCES_MODE | 0.980671 |
| 2824 | COMMONAREA_MEDI | COMMONAREA_MODE | 0.979752 |
| 1966 | ELEVATORS_MODE | ELEVATORS_AVG | 0.979015 |
| 2032 | ENTRANCES_MODE | ENTRANCES_AVG | 0.977723 |
| 2626 | BASEMENTAREA_MEDI | BASEMENTAREA_MODE | 0.977227 |
| 2560 | APARTMENTS_MEDI | APARTMENTS_MODE | 0.976900 |
| 1900 | COMMONAREA_MODE | COMMONAREA_AVG | 0.976325 |
| 3352 | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAPARTMENTS_MODE | 0.975725 |
| 2230 | LANDAREA_MODE | LANDAREA_AVG | 0.975100 |
| 3418 | NONLIVINGAREA_MEDI | NONLIVINGAREA_MODE | 0.974836 |
| 3220 | LIVINGAPARTMENTS_MEDI | LIVINGAPARTMENTS_MODE | 0.974520 |
| 3286 | LIVINGAREA_MEDI | LIVINGAREA_MODE | 0.974122 |
| 1702 | BASEMENTAREA_MODE | BASEMENTAREA_AVG | 0.973261 |
| 1768 | YEARS_BEGINEXPLUATATION_MODE | YEARS_BEGINEXPLUATATION_AVG | 0.973241 |
| 1636 | APARTMENTS_MODE | APARTMENTS_AVG | 0.973040 |
| 2362 | LIVINGAREA_MODE | LIVINGAREA_AVG | 0.971317 |
| 2296 | LIVINGAPARTMENTS_MODE | LIVINGAPARTMENTS_AVG | 0.969774 |
| 2428 | NONLIVINGAPARTMENTS_MODE | NONLIVINGAPARTMENTS_AVG | 0.967589 |
| 2494 | NONLIVINGAREA_MODE | NONLIVINGAREA_AVG | 0.965281 |
| 2692 | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BEGINEXPLUATATION_MODE | 0.964416 |
| 1376 | LIVINGAPARTMENTS_AVG | APARTMENTS_AVG | 0.943325 |
| 3224 | LIVINGAPARTMENTS_MEDI | APARTMENTS_MEDI | 0.941575 |
| 3196 | LIVINGAPARTMENTS_MEDI | APARTMENTS_AVG | 0.940442 |
| 2300 | LIVINGAPARTMENTS_MODE | APARTMENTS_MODE | 0.936308 |
| 2556 | APARTMENTS_MEDI | LIVINGAPARTMENTS_AVG | 0.935321 |
| 2570 | APARTMENTS_MEDI | LIVINGAPARTMENTS_MODE | 0.930829 |
| 2286 | LIVINGAPARTMENTS_MODE | APARTMENTS_AVG | 0.929717 |
| 3467 | TOTALAREA_MODE | LIVINGAREA_AVG | 0.924992 |
| 3495 | TOTALAREA_MODE | LIVINGAREA_MEDI | 0.919302 |
| 3289 | LIVINGAREA_MEDI | APARTMENTS_MEDI | 0.915742 |
| 1441 | LIVINGAREA_AVG | APARTMENTS_AVG | 0.913186 |
| 3210 | LIVINGAPARTMENTS_MEDI | APARTMENTS_MODE | 0.912645 |
| 2557 | APARTMENTS_MEDI | LIVINGAREA_AVG | 0.912046 |
| 3261 | LIVINGAREA_MEDI | APARTMENTS_AVG | 0.912033 |
| 2365 | LIVINGAREA_MODE | APARTMENTS_MODE | 0.910072 |
| 1646 | APARTMENTS_MODE | LIVINGAPARTMENTS_AVG | 0.906915 |
| 3481 | TOTALAREA_MODE | LIVINGAREA_MODE | 0.899615 |
| 2571 | APARTMENTS_MEDI | LIVINGAREA_MODE | 0.895640 |
| 3275 | LIVINGAREA_MEDI | APARTMENTS_MODE | 0.893727 |
| 2351 | LIVINGAREA_MODE | APARTMENTS_AVG | 0.893138 |
| 3456 | TOTALAREA_MODE | APARTMENTS_AVG | 0.891911 |
| 1647 | APARTMENTS_MODE | LIVINGAREA_AVG | 0.890055 |
| 3299 | LIVINGAREA_MEDI | LIVINGAPARTMENTS_MEDI | 0.885788 |
| 3484 | TOTALAREA_MODE | APARTMENTS_MEDI | 0.885684 |
| 3207 | LIVINGAPARTMENTS_MEDI | LIVINGAREA_AVG | 0.883829 |
| 1451 | LIVINGAREA_AVG | LIVINGAPARTMENTS_AVG | 0.882129 |
| 3271 | LIVINGAREA_MEDI | LIVINGAPARTMENTS_AVG | 0.879969 |
| 2375 | LIVINGAREA_MODE | LIVINGAPARTMENTS_MODE | 0.879610 |
| 3285 | LIVINGAREA_MEDI | LIVINGAPARTMENTS_MODE | 0.875667 |
| 2297 | LIVINGAPARTMENTS_MODE | LIVINGAREA_AVG | 0.874129 |
| 3294 | LIVINGAREA_MEDI | ELEVATORS_MEDI | 0.867910 |
| 1446 | LIVINGAREA_AVG | ELEVATORS_AVG | 0.867106 |
| 2882 | ELEVATORS_MEDI | LIVINGAREA_AVG | 0.865079 |
| 3266 | LIVINGAREA_MEDI | ELEVATORS_AVG | 0.865038 |
| 3470 | TOTALAREA_MODE | APARTMENTS_MODE | 0.862571 |
| 3760 | DEF_60_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | 0.861250 |
| 3221 | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MODE | 0.857528 |
| 2370 | LIVINGAREA_MODE | ELEVATORS_MODE | 0.855572 |
| 3280 | LIVINGAREA_MEDI | ELEVATORS_MODE | 0.855441 |
| 2361 | LIVINGAREA_MODE | LIVINGAPARTMENTS_AVG | 0.852440 |
| 1972 | ELEVATORS_MODE | LIVINGAREA_AVG | 0.851989 |
| 3466 | TOTALAREA_MODE | LIVINGAPARTMENTS_AVG | 0.847788 |
| 3494 | TOTALAREA_MODE | LIVINGAPARTMENTS_MEDI | 0.845127 |
| 3461 | TOTALAREA_MODE | ELEVATORS_AVG | 0.844243 |
| 2896 | ELEVATORS_MEDI | LIVINGAREA_MODE | 0.840698 |
| 2356 | LIVINGAREA_MODE | ELEVATORS_AVG | 0.838606 |
| 3489 | TOTALAREA_MODE | ELEVATORS_MEDI | 0.837394 |
| 2899 | ELEVATORS_MEDI | APARTMENTS_MEDI | 0.836163 |
| 1051 | ELEVATORS_AVG | APARTMENTS_AVG | 0.835854 |
| 2871 | ELEVATORS_MEDI | APARTMENTS_AVG | 0.833887 |
| 2551 | APARTMENTS_MEDI | ELEVATORS_AVG | 0.833254 |
| 3480 | TOTALAREA_MODE | LIVINGAPARTMENTS_MODE | 0.832763 |
| 1975 | ELEVATORS_MODE | APARTMENTS_MODE | 0.824885 |
| 2565 | APARTMENTS_MEDI | ELEVATORS_MODE | 0.824303 |
| 1961 | ELEVATORS_MODE | APARTMENTS_AVG | 0.821279 |
| 3475 | TOTALAREA_MODE | ELEVATORS_MODE | 0.820252 |
| 3229 | LIVINGAPARTMENTS_MEDI | ELEVATORS_MEDI | 0.814055 |
| 3201 | LIVINGAPARTMENTS_MEDI | ELEVATORS_AVG | 0.812459 |
| 1381 | LIVINGAPARTMENTS_AVG | ELEVATORS_AVG | 0.812266 |
| 2881 | ELEVATORS_MEDI | LIVINGAPARTMENTS_AVG | 0.809730 |
| 2885 | ELEVATORS_MEDI | APARTMENTS_MODE | 0.807515 |
| 2305 | LIVINGAPARTMENTS_MODE | ELEVATORS_MODE | 0.807429 |
| 1641 | APARTMENTS_MODE | ELEVATORS_AVG | 0.804603 |
| 3215 | LIVINGAPARTMENTS_MEDI | ELEVATORS_MODE | 0.799160 |
| 2895 | ELEVATORS_MEDI | LIVINGAPARTMENTS_MODE | 0.798714 |
| 2291 | LIVINGAPARTMENTS_MODE | ELEVATORS_AVG | 0.797127 |
| 1971 | ELEVATORS_MODE | LIVINGAPARTMENTS_AVG | 0.794240 |
| 197 | AMT_GOODS_PRICE | AMT_ANNUITY | 0.775038 |
| 131 | AMT_ANNUITY | AMT_CREDIT | 0.769993 |
| 1253 | FLOORSMIN_AVG | FLOORSMAX_AVG | 0.740499 |
| 3101 | FLOORSMIN_MEDI | FLOORSMAX_MEDI | 0.738489 |
| 3009 | FLOORSMAX_MEDI | FLOORSMIN_AVG | 0.738135 |
| 3073 | FLOORSMIN_MEDI | FLOORSMAX_AVG | 0.737900 |
| 3087 | FLOORSMIN_MEDI | FLOORSMAX_MODE | 0.727306 |
| 2099 | FLOORSMAX_MODE | FLOORSMIN_AVG | 0.726754 |
| 2177 | FLOORSMIN_MODE | FLOORSMAX_MODE | 0.723838 |
| 3023 | FLOORSMAX_MEDI | FLOORSMIN_MODE | 0.721085 |
| 2163 | FLOORSMIN_MODE | FLOORSMAX_AVG | 0.720300 |
| 1442 | LIVINGAREA_AVG | BASEMENTAREA_AVG | 0.692622 |
| 2622 | BASEMENTAREA_MEDI | LIVINGAREA_AVG | 0.692100 |
| 3290 | LIVINGAREA_MEDI | BASEMENTAREA_MEDI | 0.691377 |
| 2366 | LIVINGAREA_MODE | BASEMENTAREA_MODE | 0.691004 |
| 3262 | LIVINGAREA_MEDI | BASEMENTAREA_AVG | 0.689465 |
| 2636 | BASEMENTAREA_MEDI | LIVINGAREA_MODE | 0.680749 |
| 2639 | BASEMENTAREA_MEDI | APARTMENTS_MEDI | 0.680550 |
| 1186 | FLOORSMAX_AVG | ELEVATORS_AVG | 0.680214 |
| 791 | BASEMENTAREA_AVG | APARTMENTS_AVG | 0.679687 |
| 1715 | BASEMENTAREA_MODE | APARTMENTS_MODE | 0.679153 |
| 2611 | BASEMENTAREA_MEDI | APARTMENTS_AVG | 0.679040 |
| 2547 | APARTMENTS_MEDI | BASEMENTAREA_AVG | 0.678465 |
| 3006 | FLOORSMAX_MEDI | ELEVATORS_AVG | 0.677866 |
| 2352 | LIVINGAREA_MODE | BASEMENTAREA_AVG | 0.677636 |
| 2878 | ELEVATORS_MEDI | FLOORSMAX_AVG | 0.676166 |
| 3034 | FLOORSMAX_MEDI | ELEVATORS_MEDI | 0.675587 |
| 1712 | BASEMENTAREA_MODE | LIVINGAREA_AVG | 0.673834 |
| 3276 | LIVINGAREA_MEDI | BASEMENTAREA_MODE | 0.673600 |
| 3457 | TOTALAREA_MODE | BASEMENTAREA_AVG | 0.672591 |
| 2096 | FLOORSMAX_MODE | ELEVATORS_AVG | 0.671292 |
| 3485 | TOTALAREA_MODE | BASEMENTAREA_MEDI | 0.669373 |
| 2892 | ELEVATORS_MEDI | FLOORSMAX_MODE | 0.669097 |
| 2625 | BASEMENTAREA_MEDI | APARTMENTS_MODE | 0.668838 |
| 1637 | APARTMENTS_MODE | BASEMENTAREA_AVG | 0.666304 |
| 2561 | APARTMENTS_MEDI | BASEMENTAREA_MODE | 0.663966 |
| 1701 | BASEMENTAREA_MODE | APARTMENTS_AVG | 0.661561 |
| 2110 | FLOORSMAX_MODE | ELEVATORS_MODE | 0.660848 |
| 1968 | ELEVATORS_MODE | FLOORSMAX_AVG | 0.656186 |
| 3020 | FLOORSMAX_MEDI | ELEVATORS_MODE | 0.655573 |
| 2301 | LIVINGAPARTMENTS_MODE | BASEMENTAREA_MODE | 0.654589 |
| 2041 | ENTRANCES_MODE | BASEMENTAREA_MODE | 0.652464 |
| 2635 | BASEMENTAREA_MEDI | LIVINGAPARTMENTS_MODE | 0.651936 |
| 1707 | BASEMENTAREA_MODE | ENTRANCES_AVG | 0.651788 |
| 3225 | LIVINGAPARTMENTS_MEDI | BASEMENTAREA_MEDI | 0.651407 |
| 2617 | BASEMENTAREA_MEDI | ENTRANCES_AVG | 0.650922 |
| 2951 | ENTRANCES_MEDI | BASEMENTAREA_MODE | 0.650752 |
| 3471 | TOTALAREA_MODE | BASEMENTAREA_MODE | 0.649395 |
| 2965 | ENTRANCES_MEDI | BASEMENTAREA_MEDI | 0.649370 |
| 1117 | ENTRANCES_AVG | BASEMENTAREA_AVG | 0.649305 |
| 3197 | LIVINGAPARTMENTS_MEDI | BASEMENTAREA_AVG | 0.648881 |
| 2287 | LIVINGAPARTMENTS_MODE | BASEMENTAREA_AVG | 0.648028 |
| 1377 | LIVINGAPARTMENTS_AVG | BASEMENTAREA_AVG | 0.647323 |
| 2621 | BASEMENTAREA_MEDI | LIVINGAPARTMENTS_AVG | 0.647128 |
| 2937 | ENTRANCES_MEDI | BASEMENTAREA_AVG | 0.645062 |
| 3463 | TOTALAREA_MODE | FLOORSMAX_AVG | 0.633005 |
| 1448 | LIVINGAREA_AVG | FLOORSMAX_AVG | 0.630354 |
| 3491 | TOTALAREA_MODE | FLOORSMAX_MEDI | 0.630352 |
| 2631 | BASEMENTAREA_MEDI | ENTRANCES_MODE | 0.629266 |
| 3211 | LIVINGAPARTMENTS_MEDI | BASEMENTAREA_MODE | 0.629230 |
| 3012 | FLOORSMAX_MEDI | LIVINGAREA_AVG | 0.628339 |
| 3268 | LIVINGAREA_MEDI | FLOORSMAX_AVG | 0.626826 |
| 3296 | LIVINGAREA_MEDI | FLOORSMAX_MEDI | 0.626045 |
| 2102 | FLOORSMAX_MODE | LIVINGAREA_AVG | 0.626040 |
| 3477 | TOTALAREA_MODE | FLOORSMAX_MODE | 0.625672 |
| 1711 | BASEMENTAREA_MODE | LIVINGAPARTMENTS_AVG | 0.624821 |
| 2027 | ENTRANCES_MODE | BASEMENTAREA_AVG | 0.624532 |
| 3282 | LIVINGAREA_MEDI | FLOORSMAX_MODE | 0.623942 |
| 2371 | LIVINGAREA_MODE | ENTRANCES_MODE | 0.621320 |
| 2357 | LIVINGAREA_MODE | ENTRANCES_AVG | 0.621183 |
| 2961 | ENTRANCES_MEDI | LIVINGAREA_MODE | 0.620597 |
| 1181 | FLOORSMAX_AVG | APARTMENTS_AVG | 0.618626 |
| 3267 | LIVINGAREA_MEDI | ENTRANCES_AVG | 0.617373 |
| 3295 | LIVINGAREA_MEDI | ENTRANCES_MEDI | 0.617040 |
| 1447 | LIVINGAREA_AVG | ENTRANCES_AVG | 0.616662 |
| 3001 | FLOORSMAX_MEDI | APARTMENTS_AVG | 0.616419 |
| 2553 | APARTMENTS_MEDI | FLOORSMAX_AVG | 0.615086 |
| 2091 | FLOORSMAX_MODE | APARTMENTS_AVG | 0.614612 |
| 3029 | FLOORSMAX_MEDI | APARTMENTS_MEDI | 0.614135 |
| 2040 | ENTRANCES_MODE | APARTMENTS_MODE | 0.614042 |
| 2947 | ENTRANCES_MEDI | LIVINGAREA_AVG | 0.612807 |
| 2567 | APARTMENTS_MEDI | FLOORSMAX_MODE | 0.612492 |
| 2950 | ENTRANCES_MEDI | APARTMENTS_MODE | 0.610753 |
| 1642 | APARTMENTS_MODE | ENTRANCES_AVG | 0.610553 |
| 2552 | APARTMENTS_MEDI | ENTRANCES_AVG | 0.609686 |
| 2964 | ENTRANCES_MEDI | APARTMENTS_MEDI | 0.609468 |
| 1116 | ENTRANCES_AVG | APARTMENTS_AVG | 0.609416 |
| 2372 | LIVINGAREA_MODE | FLOORSMAX_MODE | 0.605787 |
| 2936 | ENTRANCES_MEDI | APARTMENTS_AVG | 0.605650 |
get_corr_matrix(dataset = pd_loan_train[list_var_continuous],
metodo='spearman', size_figure=[10,8])
0
También hemos hecho la matriz de spearman. Ambas matrices proporcionan resultados muy parecidos, por lo que llegamos a la conclusión de que es necesario realizar una simplificación de las variables para evitar problemas en el modelado, ya que muchas de nuestras variables no nos están aportando un valor añadido y nos hace el trabajo más complejo.
corr = pd_loan_train[list_var_continuous].corr('spearman')
new_corr = corr.abs()
new_corr.loc[:,:] = np.tril(new_corr, k=-1)
new_corr = new_corr.stack().to_frame('correlation').reset_index().sort_values(by='correlation', ascending=False)
new_corr[new_corr['correlation']>0.6]
| level_0 | level_1 | correlation | |
|---|---|---|---|
| 2744 | YEARS_BUILD_MEDI | YEARS_BUILD_AVG | 0.998416 |
| 2678 | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BEGINEXPLUATATION_AVG | 0.997450 |
| 3694 | OBS_60_CNT_SOCIAL_CIRCLE | OBS_30_CNT_SOCIAL_CIRCLE | 0.997376 |
| 3140 | LANDAREA_MEDI | LANDAREA_AVG | 0.996411 |
| 3074 | FLOORSMIN_MEDI | FLOORSMIN_AVG | 0.996286 |
| 3206 | LIVINGAPARTMENTS_MEDI | LIVINGAPARTMENTS_AVG | 0.996119 |
| 2810 | COMMONAREA_MEDI | COMMONAREA_AVG | 0.995749 |
| 3272 | LIVINGAREA_MEDI | LIVINGAREA_AVG | 0.995541 |
| 2546 | APARTMENTS_MEDI | APARTMENTS_AVG | 0.995248 |
| 3008 | FLOORSMAX_MEDI | FLOORSMAX_AVG | 0.994943 |
| 2612 | BASEMENTAREA_MEDI | BASEMENTAREA_AVG | 0.994792 |
| 2942 | ENTRANCES_MEDI | ENTRANCES_AVG | 0.993437 |
| 2876 | ELEVATORS_MEDI | ELEVATORS_AVG | 0.991171 |
| 2758 | YEARS_BUILD_MEDI | YEARS_BUILD_MODE | 0.987994 |
| 1834 | YEARS_BUILD_MODE | YEARS_BUILD_AVG | 0.987693 |
| 923 | YEARS_BUILD_AVG | YEARS_BEGINEXPLUATATION_AVG | 0.986985 |
| 3088 | FLOORSMIN_MEDI | FLOORSMIN_MODE | 0.986387 |
| 3022 | FLOORSMAX_MEDI | FLOORSMAX_MODE | 0.986375 |
| 2771 | YEARS_BUILD_MEDI | YEARS_BEGINEXPLUATATION_MEDI | 0.986048 |
| 2692 | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BEGINEXPLUATATION_MODE | 0.985598 |
| 2743 | YEARS_BUILD_MEDI | YEARS_BEGINEXPLUATATION_AVG | 0.985557 |
| 1768 | YEARS_BEGINEXPLUATATION_MODE | YEARS_BEGINEXPLUATATION_AVG | 0.985229 |
| 196 | AMT_GOODS_PRICE | AMT_CREDIT | 0.984895 |
| 2679 | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_AVG | 0.984651 |
| 2164 | FLOORSMIN_MODE | FLOORSMIN_AVG | 0.982697 |
| 3338 | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAPARTMENTS_AVG | 0.981757 |
| 2098 | FLOORSMAX_MODE | FLOORSMAX_AVG | 0.981643 |
| 1847 | YEARS_BUILD_MODE | YEARS_BEGINEXPLUATATION_MODE | 0.980850 |
| 3404 | NONLIVINGAREA_MEDI | NONLIVINGAREA_AVG | 0.980823 |
| 2890 | ELEVATORS_MEDI | ELEVATORS_MODE | 0.977978 |
| 1833 | YEARS_BUILD_MODE | YEARS_BEGINEXPLUATATION_AVG | 0.975123 |
| 2693 | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MODE | 0.974860 |
| 3220 | LIVINGAPARTMENTS_MEDI | LIVINGAPARTMENTS_MODE | 0.973453 |
| 1376 | LIVINGAPARTMENTS_AVG | APARTMENTS_AVG | 0.973382 |
| 2824 | COMMONAREA_MEDI | COMMONAREA_MODE | 0.971104 |
| 3154 | LANDAREA_MEDI | LANDAREA_MODE | 0.970817 |
| 3224 | LIVINGAPARTMENTS_MEDI | APARTMENTS_MEDI | 0.970764 |
| 2560 | APARTMENTS_MEDI | APARTMENTS_MODE | 0.969877 |
| 2757 | YEARS_BUILD_MEDI | YEARS_BEGINEXPLUATATION_MODE | 0.969857 |
| 1769 | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_AVG | 0.969559 |
| 3196 | LIVINGAPARTMENTS_MEDI | APARTMENTS_AVG | 0.969534 |
| 2296 | LIVINGAPARTMENTS_MODE | LIVINGAPARTMENTS_AVG | 0.969473 |
| 1966 | ELEVATORS_MODE | ELEVATORS_AVG | 0.969201 |
| 3286 | LIVINGAREA_MEDI | LIVINGAREA_MODE | 0.969048 |
| 2556 | APARTMENTS_MEDI | LIVINGAPARTMENTS_AVG | 0.967592 |
| 2230 | LANDAREA_MODE | LANDAREA_AVG | 0.967388 |
| 1900 | COMMONAREA_MODE | COMMONAREA_AVG | 0.965749 |
| 2956 | ENTRANCES_MEDI | ENTRANCES_MODE | 0.965027 |
| 1636 | APARTMENTS_MODE | APARTMENTS_AVG | 0.964807 |
| 2362 | LIVINGAREA_MODE | LIVINGAREA_AVG | 0.964042 |
| 2626 | BASEMENTAREA_MEDI | BASEMENTAREA_MODE | 0.963008 |
| 1702 | BASEMENTAREA_MODE | BASEMENTAREA_AVG | 0.959573 |
| 2032 | ENTRANCES_MODE | ENTRANCES_AVG | 0.957727 |
| 2300 | LIVINGAPARTMENTS_MODE | APARTMENTS_MODE | 0.954970 |
| 3352 | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAPARTMENTS_MODE | 0.951175 |
| 2570 | APARTMENTS_MEDI | LIVINGAPARTMENTS_MODE | 0.946656 |
| 2286 | LIVINGAPARTMENTS_MODE | APARTMENTS_AVG | 0.943402 |
| 3418 | NONLIVINGAREA_MEDI | NONLIVINGAREA_MODE | 0.941573 |
| 3467 | TOTALAREA_MODE | LIVINGAREA_AVG | 0.939764 |
| 3495 | TOTALAREA_MODE | LIVINGAREA_MEDI | 0.935193 |
| 3210 | LIVINGAPARTMENTS_MEDI | APARTMENTS_MODE | 0.932577 |
| 2428 | NONLIVINGAPARTMENTS_MODE | NONLIVINGAPARTMENTS_AVG | 0.931496 |
| 1646 | APARTMENTS_MODE | LIVINGAPARTMENTS_AVG | 0.928653 |
| 2494 | NONLIVINGAREA_MODE | NONLIVINGAREA_AVG | 0.917995 |
| 3481 | TOTALAREA_MODE | LIVINGAREA_MODE | 0.917386 |
| 1441 | LIVINGAREA_AVG | APARTMENTS_AVG | 0.906637 |
| 3289 | LIVINGAREA_MEDI | APARTMENTS_MEDI | 0.904824 |
| 3261 | LIVINGAREA_MEDI | APARTMENTS_AVG | 0.902739 |
| 2557 | APARTMENTS_MEDI | LIVINGAREA_AVG | 0.901828 |
| 3456 | TOTALAREA_MODE | APARTMENTS_AVG | 0.900361 |
| 1451 | LIVINGAREA_AVG | LIVINGAPARTMENTS_AVG | 0.897020 |
| 3484 | TOTALAREA_MODE | APARTMENTS_MEDI | 0.895120 |
| 2365 | LIVINGAREA_MODE | APARTMENTS_MODE | 0.893883 |
| 3299 | LIVINGAREA_MEDI | LIVINGAPARTMENTS_MEDI | 0.893729 |
| 3207 | LIVINGAPARTMENTS_MEDI | LIVINGAREA_AVG | 0.893102 |
| 3271 | LIVINGAREA_MEDI | LIVINGAPARTMENTS_AVG | 0.891945 |
| 2571 | APARTMENTS_MEDI | LIVINGAREA_MODE | 0.878208 |
| 3466 | TOTALAREA_MODE | LIVINGAPARTMENTS_AVG | 0.876107 |
| 3470 | TOTALAREA_MODE | APARTMENTS_MODE | 0.876080 |
| 2375 | LIVINGAREA_MODE | LIVINGAPARTMENTS_MODE | 0.875014 |
| 3275 | LIVINGAREA_MEDI | APARTMENTS_MODE | 0.874976 |
| 2351 | LIVINGAREA_MODE | APARTMENTS_AVG | 0.874050 |
| 3494 | TOTALAREA_MODE | LIVINGAPARTMENTS_MEDI | 0.871992 |
| 1647 | APARTMENTS_MODE | LIVINGAREA_AVG | 0.870607 |
| 3285 | LIVINGAREA_MEDI | LIVINGAPARTMENTS_MODE | 0.869309 |
| 2297 | LIVINGAPARTMENTS_MODE | LIVINGAREA_AVG | 0.866596 |
| 3221 | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MODE | 0.858764 |
| 2361 | LIVINGAREA_MODE | LIVINGAPARTMENTS_AVG | 0.855415 |
| 3480 | TOTALAREA_MODE | LIVINGAPARTMENTS_MODE | 0.853394 |
| 1186 | FLOORSMAX_AVG | ELEVATORS_AVG | 0.849515 |
| 3034 | FLOORSMAX_MEDI | ELEVATORS_MEDI | 0.845722 |
| 3760 | DEF_60_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | 0.845260 |
| 2878 | ELEVATORS_MEDI | FLOORSMAX_AVG | 0.844534 |
| 3006 | FLOORSMAX_MEDI | ELEVATORS_AVG | 0.842164 |
| 2110 | FLOORSMAX_MODE | ELEVATORS_MODE | 0.833303 |
| 3020 | FLOORSMAX_MEDI | ELEVATORS_MODE | 0.830893 |
| 131 | AMT_ANNUITY | AMT_CREDIT | 0.830145 |
| 1968 | ELEVATORS_MODE | FLOORSMAX_AVG | 0.829703 |
| 197 | AMT_GOODS_PRICE | AMT_ANNUITY | 0.828087 |
| 2892 | ELEVATORS_MEDI | FLOORSMAX_MODE | 0.826156 |
| 2096 | FLOORSMAX_MODE | ELEVATORS_AVG | 0.822290 |
| 1448 | LIVINGAREA_AVG | FLOORSMAX_AVG | 0.783432 |
| 3012 | FLOORSMAX_MEDI | LIVINGAREA_AVG | 0.782136 |
| 3296 | LIVINGAREA_MEDI | FLOORSMAX_MEDI | 0.780036 |
| 3463 | TOTALAREA_MODE | FLOORSMAX_AVG | 0.778940 |
| 3491 | TOTALAREA_MODE | FLOORSMAX_MEDI | 0.778108 |
| 3268 | LIVINGAREA_MEDI | FLOORSMAX_AVG | 0.778077 |
| 2102 | FLOORSMAX_MODE | LIVINGAREA_AVG | 0.776473 |
| 3477 | TOTALAREA_MODE | FLOORSMAX_MODE | 0.776253 |
| 3282 | LIVINGAREA_MEDI | FLOORSMAX_MODE | 0.774893 |
| 2372 | LIVINGAREA_MODE | FLOORSMAX_MODE | 0.760936 |
| 1181 | FLOORSMAX_AVG | APARTMENTS_AVG | 0.756658 |
| 3001 | FLOORSMAX_MEDI | APARTMENTS_AVG | 0.755424 |
| 3029 | FLOORSMAX_MEDI | APARTMENTS_MEDI | 0.752576 |
| 2091 | FLOORSMAX_MODE | APARTMENTS_AVG | 0.751576 |
| 2553 | APARTMENTS_MEDI | FLOORSMAX_AVG | 0.750744 |
| 2567 | APARTMENTS_MEDI | FLOORSMAX_MODE | 0.749187 |
| 3026 | FLOORSMAX_MEDI | LIVINGAREA_MODE | 0.747704 |
| 2358 | LIVINGAREA_MODE | FLOORSMAX_AVG | 0.744613 |
| 2105 | FLOORSMAX_MODE | APARTMENTS_MODE | 0.735494 |
| 1446 | LIVINGAREA_AVG | ELEVATORS_AVG | 0.728879 |
| 1383 | LIVINGAPARTMENTS_AVG | FLOORSMAX_AVG | 0.725581 |
| 2882 | ELEVATORS_MEDI | LIVINGAREA_AVG | 0.724450 |
| 3011 | FLOORSMAX_MEDI | LIVINGAPARTMENTS_AVG | 0.723351 |
| 3294 | LIVINGAREA_MEDI | ELEVATORS_MEDI | 0.721458 |
| 3231 | LIVINGAPARTMENTS_MEDI | FLOORSMAX_MEDI | 0.721198 |
| 3203 | LIVINGAPARTMENTS_MEDI | FLOORSMAX_AVG | 0.720724 |
| 3015 | FLOORSMAX_MEDI | APARTMENTS_MODE | 0.720197 |
| 3266 | LIVINGAREA_MEDI | ELEVATORS_AVG | 0.719683 |
| 1442 | LIVINGAREA_AVG | BASEMENTAREA_AVG | 0.719157 |
| 3461 | TOTALAREA_MODE | ELEVATORS_AVG | 0.718700 |
| 2101 | FLOORSMAX_MODE | LIVINGAPARTMENTS_AVG | 0.717937 |
| 1643 | APARTMENTS_MODE | FLOORSMAX_AVG | 0.717494 |
| 3217 | LIVINGAPARTMENTS_MEDI | FLOORSMAX_MODE | 0.716286 |
| 3262 | LIVINGAREA_MEDI | BASEMENTAREA_AVG | 0.716269 |
| 3489 | TOTALAREA_MODE | ELEVATORS_MEDI | 0.715511 |
| 3290 | LIVINGAREA_MEDI | BASEMENTAREA_MEDI | 0.714780 |
| 2622 | BASEMENTAREA_MEDI | LIVINGAREA_AVG | 0.712863 |
| 3457 | TOTALAREA_MODE | BASEMENTAREA_AVG | 0.711425 |
| 1972 | ELEVATORS_MODE | LIVINGAREA_AVG | 0.710057 |
| 3280 | LIVINGAREA_MEDI | ELEVATORS_MODE | 0.708684 |
| 3475 | TOTALAREA_MODE | ELEVATORS_MODE | 0.706810 |
| 3485 | TOTALAREA_MODE | BASEMENTAREA_MEDI | 0.704676 |
| 2366 | LIVINGAREA_MODE | BASEMENTAREA_MODE | 0.702810 |
| 2307 | LIVINGAPARTMENTS_MODE | FLOORSMAX_MODE | 0.701770 |
| 2507 | NONLIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | 0.699190 |
| 791 | BASEMENTAREA_AVG | APARTMENTS_AVG | 0.699007 |
| 2636 | BASEMENTAREA_MEDI | LIVINGAREA_MODE | 0.697937 |
| 1051 | ELEVATORS_AVG | APARTMENTS_AVG | 0.697512 |
| 2352 | LIVINGAREA_MODE | BASEMENTAREA_AVG | 0.697120 |
| 2547 | APARTMENTS_MEDI | BASEMENTAREA_AVG | 0.696480 |
| 2639 | BASEMENTAREA_MEDI | APARTMENTS_MEDI | 0.695627 |
| 2871 | ELEVATORS_MEDI | APARTMENTS_AVG | 0.693965 |
| 2611 | BASEMENTAREA_MEDI | APARTMENTS_AVG | 0.693159 |
| 1377 | LIVINGAPARTMENTS_AVG | BASEMENTAREA_AVG | 0.692516 |
| 3025 | FLOORSMAX_MEDI | LIVINGAPARTMENTS_MODE | 0.692421 |
| 1381 | LIVINGAPARTMENTS_AVG | ELEVATORS_AVG | 0.692076 |
| 3197 | LIVINGAPARTMENTS_MEDI | BASEMENTAREA_AVG | 0.690960 |
| 2899 | ELEVATORS_MEDI | APARTMENTS_MEDI | 0.690799 |
| 2293 | LIVINGAPARTMENTS_MODE | FLOORSMAX_AVG | 0.690080 |
| 3431 | NONLIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | 0.689458 |
| 3225 | LIVINGAPARTMENTS_MEDI | BASEMENTAREA_MEDI | 0.689018 |
| 2551 | APARTMENTS_MEDI | ELEVATORS_AVG | 0.688415 |
| 2370 | LIVINGAREA_MODE | ELEVATORS_MODE | 0.688238 |
| 2881 | ELEVATORS_MEDI | LIVINGAPARTMENTS_AVG | 0.687866 |
| 1583 | NONLIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | 0.686392 |
| 2621 | BASEMENTAREA_MEDI | LIVINGAPARTMENTS_AVG | 0.686389 |
| 3229 | LIVINGAPARTMENTS_MEDI | ELEVATORS_MEDI | 0.686381 |
| 3201 | LIVINGAPARTMENTS_MEDI | ELEVATORS_AVG | 0.685073 |
| 1715 | BASEMENTAREA_MODE | APARTMENTS_MODE | 0.683641 |
| 1961 | ELEVATORS_MODE | APARTMENTS_AVG | 0.681941 |
| 2565 | APARTMENTS_MEDI | ELEVATORS_MODE | 0.680355 |
| 3403 | NONLIVINGAREA_MEDI | NONLIVINGAPARTMENTS_AVG | 0.677969 |
| 2625 | BASEMENTAREA_MEDI | APARTMENTS_MODE | 0.677787 |
| 2287 | LIVINGAPARTMENTS_MODE | BASEMENTAREA_AVG | 0.677529 |
| 2635 | BASEMENTAREA_MEDI | LIVINGAPARTMENTS_MODE | 0.677384 |
| 1637 | APARTMENTS_MODE | BASEMENTAREA_AVG | 0.677007 |
| 3276 | LIVINGAREA_MEDI | BASEMENTAREA_MODE | 0.676060 |
| 2896 | ELEVATORS_MEDI | LIVINGAREA_MODE | 0.674796 |
| 1971 | ELEVATORS_MODE | LIVINGAPARTMENTS_AVG | 0.674192 |
| 3215 | LIVINGAPARTMENTS_MEDI | ELEVATORS_MODE | 0.673663 |
| 2301 | LIVINGAPARTMENTS_MODE | BASEMENTAREA_MODE | 0.673447 |
| 3339 | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_AVG | 0.672388 |
| 3471 | TOTALAREA_MODE | BASEMENTAREA_MODE | 0.671685 |
| 1712 | BASEMENTAREA_MODE | LIVINGAREA_AVG | 0.670638 |
| 2356 | LIVINGAREA_MODE | ELEVATORS_AVG | 0.668383 |
| 1253 | FLOORSMIN_AVG | FLOORSMAX_AVG | 0.664174 |
| 1975 | ELEVATORS_MODE | APARTMENTS_MODE | 0.662330 |
| 3353 | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MODE | 0.660924 |
| 3009 | FLOORSMAX_MEDI | FLOORSMIN_AVG | 0.660834 |
| 3101 | FLOORSMIN_MEDI | FLOORSMAX_MEDI | 0.660306 |
| 1117 | ENTRANCES_AVG | BASEMENTAREA_AVG | 0.659750 |
| 2561 | APARTMENTS_MEDI | BASEMENTAREA_MODE | 0.659651 |
| 2617 | BASEMENTAREA_MEDI | ENTRANCES_AVG | 0.659611 |
| 2305 | LIVINGAPARTMENTS_MODE | ELEVATORS_MODE | 0.658769 |
| 3073 | FLOORSMIN_MEDI | FLOORSMAX_AVG | 0.658606 |
| 3417 | NONLIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MODE | 0.658102 |
| 2965 | ENTRANCES_MEDI | BASEMENTAREA_MEDI | 0.658026 |
| 1701 | BASEMENTAREA_MODE | APARTMENTS_AVG | 0.653711 |
| 2937 | ENTRANCES_MEDI | BASEMENTAREA_AVG | 0.652067 |
| 2099 | FLOORSMAX_MODE | FLOORSMIN_AVG | 0.649632 |
| 3087 | FLOORSMIN_MEDI | FLOORSMAX_MODE | 0.649325 |
| 2895 | ELEVATORS_MEDI | LIVINGAPARTMENTS_MODE | 0.649242 |
| 3211 | LIVINGAPARTMENTS_MEDI | BASEMENTAREA_MODE | 0.647185 |
| 2041 | ENTRANCES_MODE | BASEMENTAREA_MODE | 0.646627 |
| 2885 | ELEVATORS_MEDI | APARTMENTS_MODE | 0.645129 |
| 2291 | LIVINGAPARTMENTS_MODE | ELEVATORS_AVG | 0.644700 |
| 2493 | NONLIVINGAREA_MODE | NONLIVINGAPARTMENTS_AVG | 0.643280 |
| 2951 | ENTRANCES_MEDI | BASEMENTAREA_MODE | 0.642491 |
| 1711 | BASEMENTAREA_MODE | LIVINGAPARTMENTS_AVG | 0.642454 |
| 2177 | FLOORSMIN_MODE | FLOORSMAX_MODE | 0.642128 |
| 1707 | BASEMENTAREA_MODE | ENTRANCES_AVG | 0.641863 |
| 1641 | APARTMENTS_MODE | ELEVATORS_AVG | 0.639971 |
| 3023 | FLOORSMAX_MEDI | FLOORSMIN_MODE | 0.638397 |
| 2429 | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_AVG | 0.636192 |
| 2163 | FLOORSMIN_MODE | FLOORSMAX_AVG | 0.636170 |
| 3460 | TOTALAREA_MODE | COMMONAREA_AVG | 0.622166 |
| 2631 | BASEMENTAREA_MEDI | ENTRANCES_MODE | 0.621522 |
| 3488 | TOTALAREA_MODE | COMMONAREA_MEDI | 0.618277 |
| 2027 | ENTRANCES_MODE | BASEMENTAREA_AVG | 0.614280 |
| 1447 | LIVINGAREA_AVG | ENTRANCES_AVG | 0.609149 |
| 3267 | LIVINGAREA_MEDI | ENTRANCES_AVG | 0.608763 |
| 3295 | LIVINGAREA_MEDI | ENTRANCES_MEDI | 0.605690 |
| 2357 | LIVINGAREA_MODE | ENTRANCES_AVG | 0.602625 |
| 2961 | ENTRANCES_MEDI | LIVINGAREA_MODE | 0.601853 |
| 2947 | ENTRANCES_MEDI | LIVINGAREA_AVG | 0.600747 |
TRATAMIENTO DE LOS VALORES NULOS
def get_percent_null_values_target(pd_loan, list_var_continuous, target):
pd_final = pd.DataFrame()
for i in list_var_continuous:
if pd_loan[i].isnull().sum() > 0:
pd_concat_percent = pd.DataFrame(pd_loan[target][pd_loan[i].isnull()]
.value_counts(normalize=True).reset_index()).T
# Verifica cuántas columnas tiene pd_concat_percent antes de asignarlas
if pd_concat_percent.shape[1] >= 2:
pd_concat_percent.columns = [pd_concat_percent.iloc[0, 0],
pd_concat_percent.iloc[0, 1]]
elif pd_concat_percent.shape[1] == 1:
pd_concat_percent.columns = [pd_concat_percent.iloc[0, 0]]
pd_concat_percent = pd_concat_percent.drop(target, axis=0)
pd_concat_percent['variable'] = i
pd_concat_percent['sum_null_values'] = pd_loan[i].isnull().sum()
pd_concat_percent['porcentaje_sum_null_values'] = pd_loan[i].isnull().sum() / pd_loan.shape[0]
pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
if pd_final.empty:
print('No existen variables con valores nulos')
return pd_final
get_percent_null_values_target(pd_loan_train, list_var_continuous, target='TARGET')
| 0.0 | variable | sum_null_values | porcentaje_sum_null_values | 1.0 | |
|---|---|---|---|---|---|
| 0 | 1.000000 | AMT_ANNUITY | 9 | 0.000037 | NaN |
| 1 | 0.921397 | AMT_GOODS_PRICE | 229 | 0.000931 | 0.078603 |
| 2 | 0.914689 | OWN_CAR_AGE | 162464 | 0.660401 | 0.085311 |
| 3 | 1.000000 | CNT_FAM_MEMBERS | 2 | 0.000008 | NaN |
| 4 | 0.914966 | EXT_SOURCE_1 | 138803 | 0.564221 | 0.085034 |
| 5 | 0.915709 | EXT_SOURCE_2 | 522 | 0.002122 | 0.084291 |
| 6 | 0.907393 | EXT_SOURCE_3 | 48787 | 0.198315 | 0.092607 |
| 7 | 0.908305 | APARTMENTS_AVG | 124849 | 0.507500 | 0.091695 |
| 8 | 0.910521 | BASEMENTAREA_AVG | 144000 | 0.585347 | 0.089479 |
| 9 | 0.907739 | YEARS_BEGINEXPLUATATION_AVG | 119997 | 0.487777 | 0.092261 |
| 10 | 0.913040 | YEARS_BUILD_AVG | 163592 | 0.664987 | 0.086960 |
| 11 | 0.914218 | COMMONAREA_AVG | 171913 | 0.698811 | 0.085782 |
| 12 | 0.908684 | ELEVATORS_AVG | 131073 | 0.532800 | 0.091316 |
| 13 | 0.907944 | ENTRANCES_AVG | 123816 | 0.503301 | 0.092056 |
| 14 | 0.907816 | FLOORSMAX_AVG | 122429 | 0.497663 | 0.092184 |
| 15 | 0.913647 | FLOORSMIN_AVG | 166943 | 0.678608 | 0.086353 |
| 16 | 0.911610 | LANDAREA_AVG | 146113 | 0.593936 | 0.088390 |
| 17 | 0.913778 | LIVINGAPARTMENTS_AVG | 168206 | 0.683742 | 0.086222 |
| 18 | 0.908343 | LIVINGAREA_AVG | 123471 | 0.501898 | 0.091657 |
| 19 | 0.914138 | NONLIVINGAPARTMENTS_AVG | 170833 | 0.694421 | 0.085862 |
| 20 | 0.909499 | NONLIVINGAREA_AVG | 135755 | 0.551832 | 0.090501 |
| 21 | 0.908305 | APARTMENTS_MODE | 124849 | 0.507500 | 0.091695 |
| 22 | 0.910521 | BASEMENTAREA_MODE | 144000 | 0.585347 | 0.089479 |
| 23 | 0.907739 | YEARS_BEGINEXPLUATATION_MODE | 119997 | 0.487777 | 0.092261 |
| 24 | 0.913040 | YEARS_BUILD_MODE | 163592 | 0.664987 | 0.086960 |
| 25 | 0.914218 | COMMONAREA_MODE | 171913 | 0.698811 | 0.085782 |
| 26 | 0.908684 | ELEVATORS_MODE | 131073 | 0.532800 | 0.091316 |
| 27 | 0.907944 | ENTRANCES_MODE | 123816 | 0.503301 | 0.092056 |
| 28 | 0.907816 | FLOORSMAX_MODE | 122429 | 0.497663 | 0.092184 |
| 29 | 0.913647 | FLOORSMIN_MODE | 166943 | 0.678608 | 0.086353 |
| 30 | 0.911610 | LANDAREA_MODE | 146113 | 0.593936 | 0.088390 |
| 31 | 0.913778 | LIVINGAPARTMENTS_MODE | 168206 | 0.683742 | 0.086222 |
| 32 | 0.908343 | LIVINGAREA_MODE | 123471 | 0.501898 | 0.091657 |
| 33 | 0.914138 | NONLIVINGAPARTMENTS_MODE | 170833 | 0.694421 | 0.085862 |
| 34 | 0.909499 | NONLIVINGAREA_MODE | 135755 | 0.551832 | 0.090501 |
| 35 | 0.908305 | APARTMENTS_MEDI | 124849 | 0.507500 | 0.091695 |
| 36 | 0.910521 | BASEMENTAREA_MEDI | 144000 | 0.585347 | 0.089479 |
| 37 | 0.907739 | YEARS_BEGINEXPLUATATION_MEDI | 119997 | 0.487777 | 0.092261 |
| 38 | 0.913040 | YEARS_BUILD_MEDI | 163592 | 0.664987 | 0.086960 |
| 39 | 0.914218 | COMMONAREA_MEDI | 171913 | 0.698811 | 0.085782 |
| 40 | 0.908684 | ELEVATORS_MEDI | 131073 | 0.532800 | 0.091316 |
| 41 | 0.907944 | ENTRANCES_MEDI | 123816 | 0.503301 | 0.092056 |
| 42 | 0.907816 | FLOORSMAX_MEDI | 122429 | 0.497663 | 0.092184 |
| 43 | 0.913647 | FLOORSMIN_MEDI | 166943 | 0.678608 | 0.086353 |
| 44 | 0.911610 | LANDAREA_MEDI | 146113 | 0.593936 | 0.088390 |
| 45 | 0.913778 | LIVINGAPARTMENTS_MEDI | 168206 | 0.683742 | 0.086222 |
| 46 | 0.908343 | LIVINGAREA_MEDI | 123471 | 0.501898 | 0.091657 |
| 47 | 0.914138 | NONLIVINGAPARTMENTS_MEDI | 170833 | 0.694421 | 0.085862 |
| 48 | 0.909499 | NONLIVINGAREA_MEDI | 135755 | 0.551832 | 0.090501 |
| 49 | 0.907445 | TOTALAREA_MODE | 118730 | 0.482627 | 0.092555 |
| 50 | 0.965599 | OBS_30_CNT_SOCIAL_CIRCLE | 843 | 0.003427 | 0.034401 |
| 51 | 0.965599 | DEF_30_CNT_SOCIAL_CIRCLE | 843 | 0.003427 | 0.034401 |
| 52 | 0.965599 | OBS_60_CNT_SOCIAL_CIRCLE | 843 | 0.003427 | 0.034401 |
| 53 | 0.965599 | DEF_60_CNT_SOCIAL_CIRCLE | 843 | 0.003427 | 0.034401 |
| 54 | 1.000000 | DAYS_LAST_PHONE_CHANGE | 1 | 0.000004 | NaN |
| 55 | 0.897417 | AMT_REQ_CREDIT_BUREAU_HOUR | 33251 | 0.135162 | 0.102583 |
| 56 | 0.897417 | AMT_REQ_CREDIT_BUREAU_DAY | 33251 | 0.135162 | 0.102583 |
| 57 | 0.897417 | AMT_REQ_CREDIT_BUREAU_WEEK | 33251 | 0.135162 | 0.102583 |
| 58 | 0.897417 | AMT_REQ_CREDIT_BUREAU_MON | 33251 | 0.135162 | 0.102583 |
| 59 | 0.897417 | AMT_REQ_CREDIT_BUREAU_QRT | 33251 | 0.135162 | 0.102583 |
| 60 | 0.897417 | AMT_REQ_CREDIT_BUREAU_YEAR | 33251 | 0.135162 | 0.102583 |
Vamos a cambiar los valores nulos de las columnas continuas por la mediana de los valores no nulos de la columna, se ha elegido la mediana ya que a diferencia de la media (promedio), la mediana no se ve afectada por valores atípicos o extremos en los datos. Al tener un dataset con variables con valores muy extremos (como se puede observar en los gráficos) la mediana es una mejor medida de tendencia central porque representa el punto medio de la distribución.
medianas_train = pd_loan_train[list_var_continuous].median()
pd_loan_train[list_var_continuous] = pd_loan_train[list_var_continuous].fillna(medianas_train)
pd_loan_test[list_var_continuous] = pd_loan_test[list_var_continuous].fillna(medianas_train)
Comprobamos que n existen valores nulos en las columnas
get_percent_null_values_target(pd_loan_train, list_var_continuous, target='TARGET')
No existen variables con valores nulos
TRATAMIENTO VARIABLES CATEGORICAS
list_var_cat
['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']
from scipy.stats import chi2_contingency
def cramers_v(matrix):
chi2, p, dof, ex = chi2_contingency(matrix) # Chi-squared test
return np.sqrt(chi2 / (matrix.sum().sum() * (min(matrix.shape) - 1)))
cramers_v_results = {}
for col in list_var_cat:
# Calculamos la matriz de confusión entre las variable categórica y TARGET
confusion_matrix = pd.crosstab(pd_loan_train["TARGET"], pd_loan_train[col])
# Calculamos el Cramér's V para esta matriz
cramers_v_value = cramers_v(confusion_matrix.values)
# Guardamos el resultado en el diccionario
cramers_v_results[col] = cramers_v_value
cramers_v_df = pd.DataFrame(list(cramers_v_results.items()), columns=['Variable', 'Cramér\'s V'])
cramers_v_df = cramers_v_df.sort_values(by='Cramér\'s V', ascending=False)
print(cramers_v_df)
Variable Cramér's V 9 OCCUPATION_TYPE 0.078551 11 ORGANIZATION_TYPE 0.072094 5 NAME_INCOME_TYPE 0.063683 6 NAME_EDUCATION_TYPE 0.057060 1 CODE_GENDER 0.055116 14 WALLSMATERIAL_MODE 0.045075 15 EMERGENCYSTATE_MODE 0.043118 13 HOUSETYPE_MODE 0.041642 7 NAME_FAMILY_STATUS 0.040930 8 NAME_HOUSING_TYPE 0.038018 0 NAME_CONTRACT_TYPE 0.031849 12 FONDKAPREMONT_MODE 0.031243 2 FLAG_OWN_CAR 0.023452 4 NAME_TYPE_SUITE 0.012199 10 WEEKDAY_APPR_PROCESS_START 0.006439 3 FLAG_OWN_REALTY 0.006397
OPCIÓN 2
confusion_matrix = pd.crosstab(pd_loan_train["TARGET"], pd_loan_train["TARGET"])
print(confusion_matrix)
cramers_v(confusion_matrix.values)
TARGET 0 1 TARGET 0 226148 0 1 0 19860
0.9999726128248573
confusion_matrix = pd.crosstab(pd_loan_train["TARGET"], pd_loan_train["TARGET"])
cramers_v(confusion_matrix.values)
0.9999726128248573
confusion_matrix = pd.crosstab(pd_loan_train["TARGET"], pd_loan_train["OWN_CAR_AGE"])
cramers_v(confusion_matrix.values)
0.04159556349977331
pd_loan_train[list_var_cat] = pd_loan_train[list_var_cat].astype("object").fillna("SIN VALOR").astype("category")
pd_loan_test[list_var_cat] = pd_loan_test[list_var_cat].astype("object").fillna("SIN VALOR").astype("category")
print(pd_loan_test)
NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY \
SK_ID_CURR
344817 Cash loans F N Y
304895 Cash loans M N Y
274533 Cash loans F N Y
416053 Cash loans M Y Y
328656 Cash loans F N Y
... ... ... ... ...
314339 Cash loans F N Y
107600 Cash loans F N Y
342675 Cash loans F Y N
109967 Cash loans M N Y
326396 Cash loans F N N
CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY \
SK_ID_CURR
344817 0 157500.0 584766.0 23319.0
304895 1 157500.0 781920.0 57028.5
274533 0 112500.0 652500.0 30366.0
416053 0 135000.0 823500.0 29304.0
328656 0 180000.0 675000.0 20668.5
... ... ... ... ...
314339 2 112500.0 696150.0 33619.5
107600 1 189000.0 244512.0 16047.0
342675 0 67500.0 900000.0 26446.5
109967 0 76500.0 247500.0 12168.0
326396 0 180000.0 225000.0 12204.0
AMT_GOODS_PRICE NAME_TYPE_SUITE NAME_INCOME_TYPE \
SK_ID_CURR
344817 472500.0 Unaccompanied Pensioner
304895 675000.0 Unaccompanied Working
274533 652500.0 Family Pensioner
416053 823500.0 Unaccompanied Working
328656 675000.0 Unaccompanied State servant
... ... ... ...
314339 562500.0 Unaccompanied Commercial associate
107600 216000.0 Family Commercial associate
342675 900000.0 Unaccompanied Pensioner
109967 247500.0 Unaccompanied Commercial associate
326396 225000.0 Unaccompanied Working
NAME_EDUCATION_TYPE NAME_FAMILY_STATUS \
SK_ID_CURR
344817 Secondary / secondary special Married
304895 Secondary / secondary special Married
274533 Secondary / secondary special Single / not married
416053 Incomplete higher Civil marriage
328656 Secondary / secondary special Single / not married
... ... ...
314339 Secondary / secondary special Civil marriage
107600 Secondary / secondary special Married
342675 Secondary / secondary special Married
109967 Secondary / secondary special Married
326396 Incomplete higher Single / not married
NAME_HOUSING_TYPE REGION_POPULATION_RELATIVE DAYS_BIRTH \
SK_ID_CURR
344817 House / apartment 0.020713 -21399
304895 With parents 0.018029 -8704
274533 House / apartment 0.010500 -21103
416053 With parents 0.030755 -11442
328656 House / apartment 0.014520 -18708
... ... ... ...
314339 House / apartment 0.035792 -13185
107600 House / apartment 0.035792 -12913
342675 House / apartment 0.006305 -20172
109967 House / apartment 0.025164 -18737
326396 House / apartment 0.026392 -9988
DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE \
SK_ID_CURR
344817 365243 -270.0 -3821 9.0
304895 -278 -8445.0 -1148 9.0
274533 365243 -12482.0 -4655 9.0
416053 -622 -6053.0 -4076 24.0
328656 -4385 -4404.0 -2266 9.0
... ... ... ... ...
314339 -2072 -1959.0 -2330 9.0
107600 -2315 -3532.0 -4541 9.0
342675 365243 -6258.0 -3388 13.0
109967 -711 -5999.0 -2261 9.0
326396 -696 -4241.0 -2404 9.0
FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE \
SK_ID_CURR
344817 1 0 0 1
304895 1 1 0 1
274533 1 0 0 1
416053 1 1 0 1
328656 1 1 0 1
... ... ... ... ...
314339 1 1 0 1
107600 1 1 0 1
342675 1 0 0 1
109967 1 1 1 1
326396 1 1 0 1
FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS \
SK_ID_CURR
344817 0 0 SIN VALOR 2.0
304895 0 0 IT staff 3.0
274533 0 0 SIN VALOR 1.0
416053 0 0 Core staff 2.0
328656 0 0 Laborers 1.0
... ... ... ... ...
314339 0 0 SIN VALOR 4.0
107600 1 0 SIN VALOR 3.0
342675 0 0 SIN VALOR 2.0
109967 0 0 Laborers 2.0
326396 1 1 Core staff 1.0
REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY \
SK_ID_CURR
344817 3 2
304895 3 2
274533 3 3
416053 2 2
328656 2 2
... ... ...
314339 2 2
107600 2 2
342675 3 3
109967 2 2
326396 2 2
WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START \
SK_ID_CURR
344817 TUESDAY 8
304895 FRIDAY 11
274533 TUESDAY 12
416053 THURSDAY 14
328656 WEDNESDAY 8
... ... ...
314339 MONDAY 14
107600 TUESDAY 11
342675 TUESDAY 12
109967 WEDNESDAY 9
326396 FRIDAY 16
REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION \
SK_ID_CURR
344817 0 0
304895 0 0
274533 0 0
416053 0 0
328656 0 0
... ... ...
314339 0 0
107600 0 1
342675 0 0
109967 0 0
326396 0 0
LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY \
SK_ID_CURR
344817 0 0
304895 0 0
274533 0 0
416053 0 0
328656 0 0
... ... ...
314339 0 1
107600 1 0
342675 0 0
109967 0 0
326396 0 0
REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY \
SK_ID_CURR
344817 0 0
304895 0 0
274533 0 0
416053 0 0
328656 0 0
... ... ...
314339 1 0
107600 1 1
342675 0 0
109967 0 0
326396 0 0
ORGANIZATION_TYPE EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 \
SK_ID_CURR
344817 XNA 0.815456 0.659799 0.633032
304895 Business Entity Type 3 0.307349 0.459206 0.535276
274533 XNA 0.773641 0.391290 0.584990
416053 Transport: type 2 0.381918 0.614966 0.137654
328656 School 0.506199 0.344191 0.535276
... ... ... ... ...
314339 Self-employed 0.357698 0.597733 0.384207
107600 Government 0.506199 0.293956 0.675413
342675 XNA 0.507980 0.638326 0.520898
109967 Business Entity Type 3 0.506199 0.381001 0.195198
326396 Restaurant 0.332320 0.606654 0.344155
APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG \
SK_ID_CURR
344817 0.2165 0.1626 0.9876
304895 0.1124 0.0408 0.9871
274533 0.0876 0.0764 0.9816
416053 0.0876 0.0764 0.9816
328656 0.0732 0.0764 0.9876
... ... ... ...
314339 0.0876 0.0764 0.9816
107600 0.0825 0.0793 0.9737
342675 0.1237 0.1208 0.9876
109967 0.0613 0.0405 0.9702
326396 0.0825 0.0814 0.9752
YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG \
SK_ID_CURR
344817 0.8300 0.0553 0.24 0.2069
304895 0.8232 0.0398 0.16 0.1379
274533 0.7552 0.0211 0.00 0.1379
416053 0.7552 0.0211 0.00 0.1379
328656 0.7552 0.0211 0.00 0.0345
... ... ... ... ...
314339 0.7552 0.0211 0.00 0.1379
107600 0.6396 0.0076 0.00 0.1379
342675 0.8300 0.0204 0.00 0.2759
109967 0.5920 0.0271 0.00 0.1552
326396 0.7552 0.0211 0.00 0.1379
FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG \
SK_ID_CURR
344817 0.3750 0.4167 0.0283 0.1765
304895 0.3750 0.3333 0.1164 0.0866
274533 0.1667 0.2083 0.0484 0.0756
416053 0.1667 0.2083 0.0484 0.0756
328656 0.1667 0.2083 0.0484 0.0756
... ... ... ... ...
314339 0.1667 0.2083 0.0484 0.0756
107600 0.1667 0.2083 0.0687 0.0672
342675 0.1667 0.2083 0.0305 0.0992
109967 0.1250 0.1667 0.0762 0.0458
326396 0.1667 0.2083 0.0484 0.0756
LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG \
SK_ID_CURR
344817 0.2588 0.0000 0.0000
304895 0.1333 0.0232 0.1594
274533 0.0746 0.0000 0.0036
416053 0.0746 0.0000 0.0036
328656 0.0746 0.0000 0.0036
... ... ... ...
314339 0.0746 0.0000 0.0036
107600 0.0704 0.0000 0.0000
342675 0.1211 0.0077 0.0573
109967 0.0650 0.0193 0.0458
326396 0.0725 0.0000 0.0000
APARTMENTS_MODE BASEMENTAREA_MODE YEARS_BEGINEXPLUATATION_MODE \
SK_ID_CURR
344817 0.2206 0.1688 0.9876
304895 0.1145 0.0424 0.9871
274533 0.0840 0.0747 0.9816
416053 0.0840 0.0747 0.9816
328656 0.0746 0.0747 0.9876
... ... ... ...
314339 0.0840 0.0747 0.9816
107600 0.0840 0.0823 0.9737
342675 0.1261 0.1254 0.9876
109967 0.0252 0.0190 0.9692
326396 0.0840 0.0845 0.9752
YEARS_BUILD_MODE COMMONAREA_MODE ELEVATORS_MODE ENTRANCES_MODE \
SK_ID_CURR
344817 0.8367 0.0558 0.2417 0.2069
304895 0.8301 0.0401 0.1611 0.1379
274533 0.7648 0.0190 0.0000 0.1379
416053 0.7648 0.0190 0.0000 0.1379
328656 0.7648 0.0190 0.0000 0.0345
... ... ... ... ...
314339 0.7648 0.0190 0.0000 0.1379
107600 0.6537 0.0077 0.0000 0.1379
342675 0.8367 0.0205 0.0000 0.2759
109967 0.5949 0.0110 0.0000 0.0690
326396 0.7648 0.0190 0.0000 0.1379
FLOORSMAX_MODE FLOORSMIN_MODE LANDAREA_MODE \
SK_ID_CURR
344817 0.3750 0.4167 0.0290
304895 0.3750 0.3333 0.1191
274533 0.1667 0.2083 0.0460
416053 0.1667 0.2083 0.0460
328656 0.1667 0.2083 0.0460
... ... ... ...
314339 0.1667 0.2083 0.0460
107600 0.1667 0.2083 0.0703
342675 0.1667 0.2083 0.0312
109967 0.0833 0.1250 0.0774
326396 0.1667 0.2083 0.0460
LIVINGAPARTMENTS_MODE LIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE \
SK_ID_CURR
344817 0.1928 0.2697 0.0000
304895 0.0946 0.1389 0.0233
274533 0.0771 0.0731 0.0000
416053 0.0771 0.0731 0.0000
328656 0.0771 0.0731 0.0000
... ... ... ...
314339 0.0771 0.0731 0.0000
107600 0.0735 0.0734 0.0000
342675 0.1084 0.1262 0.0078
109967 0.0220 0.0247 0.0000
326396 0.0771 0.0756 0.0000
NONLIVINGAREA_MODE APARTMENTS_MEDI BASEMENTAREA_MEDI \
SK_ID_CURR
344817 0.0000 0.2186 0.1626
304895 0.1688 0.1135 0.0408
274533 0.0011 0.0874 0.0760
416053 0.0011 0.0874 0.0760
328656 0.0011 0.0739 0.0760
... ... ... ...
314339 0.0011 0.0874 0.0760
107600 0.0000 0.0833 0.0793
342675 0.0606 0.1249 0.1208
109967 0.0000 0.0619 0.0405
326396 0.0000 0.0833 0.0814
YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_MEDI COMMONAREA_MEDI \
SK_ID_CURR
344817 0.9876 0.8323 0.0557
304895 0.9871 0.8256 0.0400
274533 0.9816 0.7585 0.0209
416053 0.9816 0.7585 0.0209
328656 0.9876 0.7585 0.0209
... ... ... ...
314339 0.9816 0.7585 0.0209
107600 0.9737 0.6444 0.0077
342675 0.9876 0.8323 0.0205
109967 0.9702 0.5975 0.0273
326396 0.9752 0.7585 0.0209
ELEVATORS_MEDI ENTRANCES_MEDI FLOORSMAX_MEDI FLOORSMIN_MEDI \
SK_ID_CURR
344817 0.24 0.2069 0.3750 0.4167
304895 0.16 0.1379 0.3750 0.3333
274533 0.00 0.1379 0.1667 0.2083
416053 0.00 0.1379 0.1667 0.2083
328656 0.00 0.0345 0.1667 0.2083
... ... ... ... ...
314339 0.00 0.1379 0.1667 0.2083
107600 0.00 0.1379 0.1667 0.2083
342675 0.00 0.2759 0.1667 0.2083
109967 0.00 0.1552 0.1250 0.1667
326396 0.00 0.1379 0.1667 0.2083
LANDAREA_MEDI LIVINGAPARTMENTS_MEDI LIVINGAREA_MEDI \
SK_ID_CURR
344817 0.0288 0.1796 0.2635
304895 0.1185 0.0881 0.1357
274533 0.0489 0.0765 0.0750
416053 0.0489 0.0765 0.0750
328656 0.0489 0.0765 0.0750
... ... ... ...
314339 0.0489 0.0765 0.0750
107600 0.0699 0.0684 0.0717
342675 0.0310 0.1009 0.1233
109967 0.0776 0.0466 0.0662
326396 0.0489 0.0765 0.0738
NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI FONDKAPREMONT_MODE \
SK_ID_CURR
344817 0.0000 0.0000 reg oper account
304895 0.0233 0.1628 reg oper account
274533 0.0000 0.0031 SIN VALOR
416053 0.0000 0.0031 SIN VALOR
328656 0.0000 0.0031 SIN VALOR
... ... ... ...
314339 0.0000 0.0031 SIN VALOR
107600 0.0000 0.0000 reg oper account
342675 0.0078 0.0585 reg oper account
109967 0.0194 0.0467 reg oper account
326396 0.0000 0.0000 SIN VALOR
HOUSETYPE_MODE TOTALAREA_MODE WALLSMATERIAL_MODE \
SK_ID_CURR
344817 block of flats 0.2338 Panel
304895 block of flats 0.1613 Panel
274533 SIN VALOR 0.0688 SIN VALOR
416053 SIN VALOR 0.0688 SIN VALOR
328656 SIN VALOR 0.0483 SIN VALOR
... ... ... ...
314339 SIN VALOR 0.0688 SIN VALOR
107600 block of flats 0.0554 Panel
342675 block of flats 0.1077 Panel
109967 block of flats 0.0246 Stone, brick
326396 block of flats 0.0570 Panel
EMERGENCYSTATE_MODE OBS_30_CNT_SOCIAL_CIRCLE \
SK_ID_CURR
344817 No 0.0
304895 No 1.0
274533 SIN VALOR 0.0
416053 SIN VALOR 0.0
328656 No 1.0
... ... ...
314339 SIN VALOR 6.0
107600 No 4.0
342675 No 0.0
109967 No 1.0
326396 No 0.0
DEF_30_CNT_SOCIAL_CIRCLE OBS_60_CNT_SOCIAL_CIRCLE \
SK_ID_CURR
344817 0.0 0.0
304895 1.0 1.0
274533 0.0 0.0
416053 0.0 0.0
328656 0.0 1.0
... ... ...
314339 0.0 6.0
107600 0.0 4.0
342675 0.0 0.0
109967 0.0 1.0
326396 0.0 0.0
DEF_60_CNT_SOCIAL_CIRCLE DAYS_LAST_PHONE_CHANGE FLAG_DOCUMENT_2 \
SK_ID_CURR
344817 0.0 -1068.0 0
304895 1.0 -805.0 0
274533 0.0 0.0 0
416053 0.0 -860.0 0
328656 0.0 0.0 0
... ... ... ...
314339 0.0 0.0 0
107600 0.0 0.0 0
342675 0.0 -456.0 0
109967 0.0 -656.0 0
326396 0.0 -328.0 0
FLAG_DOCUMENT_3 FLAG_DOCUMENT_4 FLAG_DOCUMENT_5 \
SK_ID_CURR
344817 1 0 0
304895 1 0 0
274533 0 0 0
416053 1 0 0
328656 1 0 0
... ... ... ...
314339 1 0 0
107600 1 0 0
342675 0 0 0
109967 1 0 0
326396 1 0 0
FLAG_DOCUMENT_6 FLAG_DOCUMENT_7 FLAG_DOCUMENT_8 \
SK_ID_CURR
344817 0 0 0
304895 0 0 0
274533 1 0 0
416053 0 0 0
328656 0 0 0
... ... ... ...
314339 0 0 0
107600 0 0 0
342675 1 0 0
109967 0 0 0
326396 0 0 0
FLAG_DOCUMENT_9 FLAG_DOCUMENT_10 FLAG_DOCUMENT_11 \
SK_ID_CURR
344817 0 0 0
304895 0 0 0
274533 0 0 0
416053 0 0 0
328656 0 0 0
... ... ... ...
314339 0 0 0
107600 0 0 0
342675 0 0 0
109967 0 0 0
326396 0 0 0
FLAG_DOCUMENT_12 FLAG_DOCUMENT_13 FLAG_DOCUMENT_14 \
SK_ID_CURR
344817 0 0 0
304895 0 0 0
274533 0 0 0
416053 0 0 0
328656 0 0 0
... ... ... ...
314339 0 0 0
107600 0 0 0
342675 0 0 0
109967 0 0 0
326396 0 0 0
FLAG_DOCUMENT_15 FLAG_DOCUMENT_16 FLAG_DOCUMENT_17 \
SK_ID_CURR
344817 0 0 0
304895 0 0 0
274533 0 0 0
416053 0 0 0
328656 0 0 0
... ... ... ...
314339 0 0 0
107600 0 0 0
342675 0 0 0
109967 0 0 0
326396 0 0 0
FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 \
SK_ID_CURR
344817 0 0 0
304895 0 0 0
274533 0 0 0
416053 0 0 0
328656 0 0 0
... ... ... ...
314339 0 0 0
107600 0 0 0
342675 0 0 0
109967 0 0 0
326396 0 0 0
FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR \
SK_ID_CURR
344817 0 0.0
304895 0 0.0
274533 0 0.0
416053 0 0.0
328656 0 0.0
... ... ...
314339 0 0.0
107600 0 0.0
342675 0 0.0
109967 0 0.0
326396 0 0.0
AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK \
SK_ID_CURR
344817 0.0 0.0
304895 0.0 0.0
274533 0.0 0.0
416053 0.0 0.0
328656 0.0 0.0
... ... ...
314339 0.0 0.0
107600 0.0 0.0
342675 0.0 0.0
109967 0.0 0.0
326396 0.0 0.0
AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT \
SK_ID_CURR
344817 0.0 1.0
304895 0.0 0.0
274533 0.0 0.0
416053 1.0 0.0
328656 0.0 0.0
... ... ...
314339 0.0 0.0
107600 0.0 0.0
342675 0.0 1.0
109967 0.0 0.0
326396 1.0 1.0
AMT_REQ_CREDIT_BUREAU_YEAR TARGET
SK_ID_CURR
344817 4.0 0
304895 1.0 1
274533 2.0 0
416053 1.0 0
328656 1.0 0
... ... ...
314339 1.0 0
107600 1.0 0
342675 0.0 0
109967 0.0 1
326396 1.0 0
[61503 rows x 121 columns]
pd_loan_train.isna().sum().sum()
0
pd_loan_train.to_csv("../data/train_pd_data_preprocessing_missing_outlier.csv")
pd_loan_test.to_csv("../data/test_pd_data_preprocessing_missing_outlier.csv")